In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load the data from the 'data' subfolder
df = pd.read_csv('data/train.csv')

# Define the text cleaning function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Apply the function to the comment text
df['cleaned_text'] = df['comment_text'].apply(preprocess_text)
print("✅ Data loaded and cleaned.")

# Split the data for training and testing
X = df['cleaned_text']
y = df['toxic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("✅ Data split into training and testing sets.")

✅ Data loaded and cleaned.
✅ Data split into training and testing sets.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Initialize the vectorizer
# We'll limit it to the 5000 most frequent words to keep it manageable.
vectorizer = TfidfVectorizer(max_features=5000)

# 2. Fit the vectorizer on the training data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# 3. Transform the test data using the same fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# Print a confirmation
print("✅ Text vectorization complete.")
print(f"   - Shape of training data: {X_train_tfidf.shape}")
print(f"   - Shape of testing data: {X_test_tfidf.shape}")

✅ Text vectorization complete.
   - Shape of training data: (127656, 5000)
   - Shape of testing data: (31915, 5000)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Initialize and train the model
print("Training the model...")
model = LogisticRegression(max_iter=1000) # Increased max_iter for convergence
model.fit(X_train_tfidf, y_train)
print("✅ Model training complete.")

# 2. Make predictions on the test data
print("\nMaking predictions...")
y_pred = model.predict(X_test_tfidf)
print("✅ Predictions made.")

# 3. Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training the model...
✅ Model training complete.

Making predictions...
✅ Predictions made.

Model Accuracy: 95.69%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28859
           1       0.90      0.62      0.73      3056

    accuracy                           0.96     31915
   macro avg       0.93      0.81      0.85     31915
weighted avg       0.96      0.96      0.95     31915



In [5]:
# Create a few example sentences to test
test_sentences = [
    "You are a wonderful and talented person, keep up the great work!",
    "You are a stupid idiot and I hate you."
]

# A quick function to predict a single sentence
def predict_toxicity(sentence):
    # 1. Clean the sentence
    cleaned_sentence = preprocess_text(sentence)
    
    # 2. Vectorize the sentence using the SAME vectorizer
    sentence_tfidf = vectorizer.transform([cleaned_sentence])
    
    # 3. Predict using the trained model
    prediction = model.predict(sentence_tfidf)[0]
    
    # 4. Get the prediction probability
    probability = model.predict_proba(sentence_tfidf)[0]
    
    # Return a user-friendly result
    if prediction == 1:
        return f"Prediction: Toxic (Confidence: {probability[1]*100:.2f}%)"
    else:
        return f"Prediction: Not-Toxic (Confidence: {probability[0]*100:.2f}%)"

# Loop through the test sentences and see the results
for sentence in test_sentences:
    print(f"Sentence: '{sentence}'")
    print(predict_toxicity(sentence))
    print("-" * 30)

Sentence: 'You are a wonderful and talented person, keep up the great work!'
Prediction: Not-Toxic (Confidence: 89.47%)
------------------------------
Sentence: 'You are a stupid idiot and I hate you.'
Prediction: Toxic (Confidence: 100.00%)
------------------------------


In [6]:
import pickle

# Save the TfidfVectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the Logistic Regression model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("✅ Model and Vectorizer have been saved successfully to your project folder.")
print("   - vectorizer.pkl")
print("   - model.pkl")

✅ Model and Vectorizer have been saved successfully to your project folder.
   - vectorizer.pkl
   - model.pkl
