<a href="https://colab.research.google.com/github/Dhanashree-Selva/ai-coding-assessment-file/blob/main/ai_code_assessment_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# ---------------------------------------------------------------------------
# Urgency classifier for support e-mails.
#
# Pipeline: load CSV -> derive keyword-based urgency labels from the subject
# -> clean subject+body text -> TF-IDF + logistic regression -> evaluate on a
# held-out split -> write predictions for every row to a CSV.
#
# NOTE(review): the labels are derived from subject keywords and the subject is
# also part of the model input, so the reported metrics mostly measure how well
# the model recovers the keyword rule, not real-world urgency.
# ---------------------------------------------------------------------------

# Configuration: everything a reader might want to tune lives here.
DATA_PATH = "68b1acd44f393_Sample_Support_Emails_Dataset (3).csv"
OUTPUT_PATH = "ai_model_submission.csv"
TEST_SIZE = 0.2
SEED = 42

# Load dataset
df = pd.read_csv(DATA_PATH)

# Step 1: Create target labels (Urgency)
# A subject is labelled urgent (1) when it contains any keyword as a substring.
urgent_words = ["urgent", "immediate", "asap", "help", "blocked", "error"]
urgent_pattern = "|".join(re.escape(word) for word in urgent_words)
subject_text = df['subject'].fillna('').astype(str)
df['urgency'] = (
    subject_text.str.lower()
    .str.contains(urgent_pattern, regex=True)
    .astype(int)
)

print("Urgency distribution:\n", df['urgency'].value_counts())

# Step 2: Preprocess text (subject + body)
# Missing values become empty strings and both columns are coerced to str so
# the concatenation cannot fail on NaN or non-string cells.
df['text'] = subject_text + " " + df['body'].fillna('').astype(str)

# Only download the stop-word corpus when it is not already available locally.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words("english"))

# Replace every non-letter with a space (instead of deleting it) so that
# "third-party" yields "third party" rather than the fused token "thirdparty",
# then drop stop words.
df['clean_text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^a-z\s]', ' ', regex=True)
    .apply(lambda text: " ".join(w for w in text.split() if w not in stop_words))
)

# Step 3: Train/Test Split
X = df['clean_text']
y = df['urgency']

# Stratify so both classes keep their proportions in the small test split.
# Stratification raises ValueError when a class has too few samples; in that
# case fall back to the plain random split.
try:
    split = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
    )
except ValueError:
    split = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)
X_train, X_test, y_train, y_test = split

# Step 4: TF-IDF + Logistic Regression
# The vectorizer is fitted on the training split only and reused for the test
# split and for the final predictions.
vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

# Step 5: Evaluation
y_pred = model.predict(X_test_tfidf)

print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
# labels=[0, 1] keeps the report aligned with target_names even when a class is
# absent from the test split; zero_division=0 reports 0.0 (without a warning)
# for a class that was never predicted.
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["Normal", "Urgent"],
        zero_division=0,
    ),
)

# Step 6: Save Predictions
# Predictions cover every row of the dataset (training rows included).
df['predicted_urgency'] = model.predict(vectorizer.transform(df['clean_text']))
df['predicted_urgency'] = df['predicted_urgency'].map({0: "Normal", 1: "Urgent"})

submission = df[['sender', 'subject', 'body', 'predicted_urgency']]
submission.to_csv(OUTPUT_PATH, index=False)

print("\n✅ AI model predictions saved to ai_model_submission.csv")
print("\nPreview:\n", submission.head())


Urgency distribution:
 urgency
1    11
0     9
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



Model Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

      Normal       0.00      0.00      0.00         2
      Urgent       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4


✅ AI model predictions saved to ai_model_submission.csv

Preview:
               sender                                     subject  \
0     eve@startup.io     Help required with account verification   
1    diana@client.co            General query about subscription   
2     eve@startup.io  Immediate support needed for billing error   
3  alice@example.com       Urgent request: system access blocked   
4     eve@startup.io              Question: integration with API   

                                                body predicted_urgency  
0  Do you support integration with third-party AP...            Urgent  
1  H

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
