# Run pip install -r requirements.txt


In [14]:
import json, re, pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

Load dataset

In [15]:
with open('questions.json', 'r') as file:
    data = json.load(file)

Convert JSON to DataFrame for easier handling and Check data

In [16]:
df = pd.DataFrame(data)
print(df.head())

                                       question_text  \
0  What is the thickness of the drywall in the re...   
1   How many filters does JCI owe in their contract?   
2  what CB will the changes resulting from RFI 18...   
3         What's the flooring type in the penthouse?   
4  what CB will the changes resulting from RFI 18...   

                   created_at                               user_id  
0  2024-01-19 15:56:34.383+00  64201228-e558-4722-962e-69c831e1ea8f  
1  2023-11-08 16:06:17.035+00  64201228-e558-4722-962e-69c831e1ea8f  
2  2024-01-10 14:22:33.006+00  4f697825-9bb6-4fa6-b3ab-a46dc6d42919  
3  2024-01-10 20:55:19.668+00  fcad35ff-bb4b-435d-84f9-2f3b00c24f57  
4  2024-01-16 21:36:23.569+00  4f697825-9bb6-4fa6-b3ab-a46dc6d42919  


PreProcess

In [17]:
df['question_text'] = df['question_text'].str.strip()

# Add labels based on predefined categories

In [18]:
def classify_question(text):
    if re.search(r'thickness|dimensions|material|specification', text, re.IGNORECASE):
        return "Material Specification"
    elif re.search(r'contract|owe|deadline|deliverables', text, re.IGNORECASE):
        return "Contractual Obligations"
    elif re.search(r'who|responsible|which', text, re.IGNORECASE):
        return "Responsibility Assignment"
    elif re.search(r'find|locate|document|status|information', text, re.IGNORECASE):
        return "Document or Information Requests"
    elif re.search(r'how|process|steps|guidance', text, re.IGNORECASE):
        return "Technical Guidance"
    elif re.search(r'when|schedule|timeline|milestone', text, re.IGNORECASE):
        return "Project Timeline"
    else:
        return "Other"

df['category'] = df['question_text'].apply(classify_question)
print(df.head())


                                       question_text  \
0  What is the thickness of the drywall in the re...   
1   How many filters does JCI owe in their contract?   
2  what CB will the changes resulting from RFI 18...   
3         What's the flooring type in the penthouse?   
4  what CB will the changes resulting from RFI 18...   

                   created_at                               user_id  \
0  2024-01-19 15:56:34.383+00  64201228-e558-4722-962e-69c831e1ea8f   
1  2023-11-08 16:06:17.035+00  64201228-e558-4722-962e-69c831e1ea8f   
2  2024-01-10 14:22:33.006+00  4f697825-9bb6-4fa6-b3ab-a46dc6d42919   
3  2024-01-10 20:55:19.668+00  fcad35ff-bb4b-435d-84f9-2f3b00c24f57   
4  2024-01-16 21:36:23.569+00  4f697825-9bb6-4fa6-b3ab-a46dc6d42919   

                  category  
0   Material Specification  
1  Contractual Obligations  
2                    Other  
3                    Other  
4                    Other  


# Split data into features and labels

In [19]:
x = df['question_text']
y = df['category']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Convert text to TF-IDF features


In [20]:
vectorizer = TfidfVectorizer(max_features=500)
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [21]:
# Train Logistic Regression Classifier
model = LogisticRegression(random_state=42)
model.fit(x_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(x_test_tfidf)


In [22]:
# Evaluate the model
print("Classification Report:\n")
print(classification_report(y_test, y_pred))


Classification Report:

                                  precision    recall  f1-score   support

Document or Information Requests       1.00      0.09      0.17        11
          Material Specification       1.00      0.33      0.50        15
                           Other       0.75      1.00      0.86        63
       Responsibility Assignment       0.00      0.00      0.00         3
              Technical Guidance       0.86      0.95      0.90        19

                        accuracy                           0.78       111
                       macro avg       0.72      0.47      0.48       111
                    weighted avg       0.81      0.78      0.72       111



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# !Test on new questions

In [23]:
new_questions = [
    "What is the thickness of the drywall in the residence?",
    "How many filters does JCI owe in their contract?",
    "Who is responsible for waterproofing?",
    "Where can I find the latest drawings?"
]


new_questions_tfidf = vectorizer.transform(new_questions)
predictions = model.predict(new_questions_tfidf)

# Display results


In [24]:

for question, category in zip(new_questions, predictions):
    print(f"Question: {question} -> Predicted Category: {category}")


Question: What is the thickness of the drywall in the residence? -> Predicted Category: Material Specification
Question: How many filters does JCI owe in their contract? -> Predicted Category: Contractual Obligations
Question: Who is responsible for waterproofing? -> Predicted Category: Other
Question: Where can I find the latest drawings? -> Predicted Category: Document or Information Requests


# Save model & vectorizer if Needed


In [None]:
# Change to True if you want to save the model
need_to_save = False

In [27]:
if (need_to_save):   
    with open('model/question_classifier.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    with open('model/vectorizer.pkl', 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)
