In [1]:
import pandas as pd

# Load the training data
try:
    train_data = pd.read_csv('train.csv')
    print("Training data loaded successfully.")
except FileNotFoundError:
    print("Error: 'train.csv' not found.")

# Load the test data
try:
    test_data = pd.read_csv('test.csv')
    print("Test data loaded successfully.")
except FileNotFoundError:
    print("Error: 'test.csv' not found.")

# Display the first few rows of the training and test data
print("\nFirst 5 rows of the training data:")
print(train_data.head())

print("\nFirst 5 rows of the test data:")
print(test_data.head())

# Display information about the datasets
print("\nTraining Data Information:")
train_data.info()

print("\nTest Data Information:")
test_data.info()

# Check for missing values
print("\nMissing values in the training data:")
print(train_data.isnull().sum())

print("\nMissing values in the test data:")
print(test_data.isnull().sum())


Training data loaded successfully.
Test data loaded successfully.

First 5 rows of the training data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  

First 5 rows of the test data:
                 id            

In [28]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text data
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords and lemmatize
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply cleaning to training data
train_data['comment_text'] = train_data['comment_text'].apply(clean_text)

# Apply cleaning to test data
test_data['comment_text'] = test_data['comment_text'].apply(clean_text)

# Verify cleaning by displaying the first few rows
print("\nCleaned Training Data:")
print(train_data['comment_text'].head())

print("\nCleaned Test Data:")
print(test_data['comment_text'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Cleaned Training Data:
0    explanation edits made username hardcore metal...
1    daww match background colour im seemingly stuc...
2    hey man im really trying edit war guy constant...
3    cant make real suggestion improvement wondered...
4                  sir hero chance remember page thats
Name: comment_text, dtype: object

Cleaned Test Data:
0    yo bitch ja rule succesful youll ever whats ha...
1                                   rfc title fine imo
2                         source zawe ashton lapland —
3    look back source information updated correct f...
4                        dont anonymously edit article
Name: comment_text, dtype: object


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer with a max feature limit
vectorizer = TfidfVectorizer(max_features=10000 , ngram_range=(1, 2))

# Fit and transform the training data
X_train = vectorizer.fit_transform(train_data['comment_text'])

# Transform the test data using the same fitted vectorizer
X_test = vectorizer.transform(test_data['comment_text'])

# Check the shape of the resulting TF-IDF matrices
print("\nTF-IDF matrix shape for training data:", X_train.shape)
print("TF-IDF matrix shape for test data:", X_test.shape)



TF-IDF matrix shape for training data: (159571, 10000)
TF-IDF matrix shape for test data: (153164, 10000)


In [44]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\program files\python312\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 508.0 kB/s eta 0:04:05
   ---------------------------------------- 0.5/124.9 MB 508.0 kB/s eta 0:04:05
   ---------------------------------------- 0.5/124.9 MB 508.0 kB/s eta 0:04:05
   ---------------------------------------- 0.8/124.9 MB 466.0 kB/s eta 0:04:27
   ---------------------------------------- 0.8/124.9 MB 466.0 kB/s eta 0:04:27
   ----------------------

In [46]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, train_data[labels], test_size=0.2, random_state=42)

# Initialize and fit the XGBoost model
xgb_models = {}

# Iterate through all labels and train separate models for each
for label in labels:
    model = xgb.XGBClassifier(eval_metric='mlogloss', scale_pos_weight=1)
    
    # Train the model
    model.fit(X_train, y_train[label])
    
    # Store the model
    xgb_models[label] = model
    print(f"XGBoost model trained for label: {label}")

# Save each model
import joblib

for label, model in xgb_models.items():
    joblib.dump(model, f"{label}_xgb_model.pkl")
    print(f"XGBoost model for '{label}' saved successfully as '{label}_xgb_model.pkl'")

# Evaluate the models on the validation data
for label in labels:
    model = xgb_models[label]
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Evaluate accuracy and print classification report
    accuracy = accuracy_score(y_val[label], y_pred)
    print(f"Accuracy for {label}: {accuracy}")
    print(classification_report(y_val[label], y_pred))

# Optionally: Evaluate overall accuracy
# Calculate the average accuracy across all labels
overall_accuracy = sum([accuracy_score(y_val[label], xgb_models[label].predict(X_val)) for label in labels]) / len(labels)
print(f"Overall accuracy: {overall_accuracy}")


XGBoost model trained for label: toxic
XGBoost model trained for label: severe_toxic
XGBoost model trained for label: obscene
XGBoost model trained for label: threat
XGBoost model trained for label: insult
XGBoost model trained for label: identity_hate
XGBoost model for 'toxic' saved successfully as 'toxic_xgb_model.pkl'
XGBoost model for 'severe_toxic' saved successfully as 'severe_toxic_xgb_model.pkl'
XGBoost model for 'obscene' saved successfully as 'obscene_xgb_model.pkl'
XGBoost model for 'threat' saved successfully as 'threat_xgb_model.pkl'
XGBoost model for 'insult' saved successfully as 'insult_xgb_model.pkl'
XGBoost model for 'identity_hate' saved successfully as 'identity_hate_xgb_model.pkl'
Accuracy for toxic: 0.9549428168572771
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28859
           1       0.91      0.59      0.71      3056

    accuracy                           0.95     31915
   macro avg       0.93      0.7

In [5]:
# Initialize an empty DataFrame to store predictions
predictions = pd.DataFrame()

# Making predictions for each label
for label in labels:
    # Get the model for the current label
    model = models[label]
    
    # Make predictions on the test data for the current label
    predictions[label] = model.predict(X_test)

# Display the first few predictions to check
print("\nPredictions for Test Data:")
print(predictions.head())

# Optional: Save the predictions to a CSV file for submission
predictions.to_csv("predictions.csv", index=False)



Predictions for Test Data:
   toxic  severe_toxic  obscene  threat  insult  identity_hate
0      1             0        1       0       1              0
1      0             0        0       0       0              0
2      0             0        0       0       0              0
3      0             0        0       0       0              0
4      0             0        0       0       0              0


In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the actual labels from the test_labels.csv
test_labels = pd.read_csv('test_labels.csv')

# Initialize a dictionary to store evaluation metrics
metrics = {}

# Evaluate the model for each label
for label in labels:
    # Get the true values from the test_labels file for the current label
    true_values = test_labels[label]
    
    # Get the predicted values from the predictions DataFrame
    predicted_values = predictions[label]
    
    # Calculate Mean Squared Error (MSE) and Mean Absolute Error (MAE)
    mse = mean_squared_error(true_values, predicted_values)
    mae = mean_absolute_error(true_values, predicted_values)
    
    # Store the metrics in the dictionary
    metrics[label] = {'MSE': mse, 'MAE': mae}

# Display the evaluation metrics
print("\nEvaluation Metrics:")
for label in metrics:
    print(f"{label}: MSE = {metrics[label]['MSE']}, MAE = {metrics[label]['MAE']}")



Evaluation Metrics:
toxic: MSE = 0.9945222114857277, MAE = 0.7379345015800057
severe_toxic: MSE = 0.6038037658979917, MAE = 0.591411820009924
obscene: MSE = 0.8231634065446188, MAE = 0.6717701287508814
threat: MSE = 0.5862670079130866, MAE = 0.5846347705727194
insult: MSE = 0.7676020474785198, MAE = 0.6538808075004571
identity_hate: MSE = 0.6034512026324724, MAE = 0.5919471938575644


In [37]:
# Convert the predictions to probabilities (using the trained models)
final_predictions = pd.DataFrame()

for label in labels:
    # Get the model for the current label
    model = models[label]
    
    # Get the predicted probabilities for the test data
    predicted_probs = model.predict_proba(X_test)[:, 1]  # Probabilities for class 1 (toxic)
    
    # Add the predicted probabilities to the final_predictions DataFrame
    final_predictions[label] = predicted_probs

# Display the first few rows to ensure it's in the correct format
print("\nFinal Predictions for Submission:")
print(final_predictions.head())

# Save the final predictions to a CSV file for submission
final_predictions.to_csv("final_submission.csv", index=False)



Final Predictions for Submission:
      toxic  severe_toxic   obscene    threat    insult  identity_hate
0  0.828086      0.032968  0.690753  0.008386  0.392487       0.032928
1  0.032382      0.005157  0.026415  0.001994  0.018443       0.004022
2  0.063789      0.006694  0.024206  0.002426  0.027343       0.006063
3  0.027883      0.004327  0.011034  0.001784  0.011141       0.003046
4  0.073659      0.005766  0.026303  0.002199  0.030256       0.005184


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize and fit the TF-IDF vectorizer on the training text data
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['comment_text'])

# Now, proceed with training the models using X_train_tfidf
for label in labels:
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train_tfidf, train_data[label])
    models[label] = model
    print(f"Model trained for label: {label}")

# Save each model and the vectorizer as shown previously
import joblib

for label, model in models.items():
    joblib.dump(model, f"{label}_model.pkl")
    print(f"Model for '{label}' saved successfully as '{label}_model.pkl'")

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
print("TF-IDF vectorizer saved successfully as 'tfidf_vectorizer.pkl'")


Model trained for label: toxic
Model trained for label: severe_toxic
Model trained for label: obscene
Model trained for label: threat
Model trained for label: insult
Model trained for label: identity_hate
Model for 'toxic' saved successfully as 'toxic_model.pkl'
Model for 'severe_toxic' saved successfully as 'severe_toxic_model.pkl'
Model for 'obscene' saved successfully as 'obscene_model.pkl'
Model for 'threat' saved successfully as 'threat_model.pkl'
Model for 'insult' saved successfully as 'insult_model.pkl'
Model for 'identity_hate' saved successfully as 'identity_hate_model.pkl'
TF-IDF vectorizer saved successfully as 'tfidf_vectorizer.pkl'


In [48]:
from sklearn.metrics import accuracy_score

# Assuming the models are trained and the TF-IDF vectorizer is fitted
# Prepare the test data and labels (same steps as training)
test_data = pd.read_csv('test.csv')  # Load the test data
test_labels = pd.read_csv('test_labels.csv')  # Load the test labels

# Preprocess the test comments
X_test_tfidf = tfidf_vectorizer.transform(test_data['comment_text'])

# Initialize variables to track accuracy for each label
accuracies = {}

# Iterate through each label and evaluate the accuracy of each model
for label in labels:
    # Predict the probabilities for the test data
    y_pred = models[label].predict(X_test_tfidf)
    y_true = test_labels[label]
    
    # Calculate the accuracy for this label
    accuracy = accuracy_score(y_true, y_pred)
    accuracies[label] = accuracy
    print(f"Accuracy for {label}: {accuracy:.4f}")

# Calculate the overall accuracy (average of all labels' accuracies)
overall_accuracy = sum(accuracies.values()) / len(accuracies)
print(f"Overall accuracy: {overall_accuracy:.4f}")


Accuracy for toxic: 0.3918
Accuracy for severe_toxic: 0.4152
Accuracy for obscene: 0.4027
Accuracy for threat: 0.4162
Accuracy for insult: 0.4007
Accuracy for identity_hate: 0.4134
Overall accuracy: 0.4067


In [15]:
from sklearn.metrics import classification_report

# Loop through each label to evaluate the model for each label separately
for label in labels:
    # Get true labels for the current label and filter out any -1 values
    y_true_label = test_labels[test_labels[label] != -1][label]
    
    # Predict labels for test data using the trained model for the current label
    # Only on rows where y_true_label is valid (not -1)
    y_pred_label = models[label].predict(X_test_tfidf[test_labels[label] != -1])
    
    # Display the classification report for the current label
    print(f"Classification report for label: {label}")
    print(classification_report(y_true_label, y_pred_label, target_names=["not_" + label, label]))
    print("\n" + "-"*60 + "\n")  # Separator for readability


Classification report for label: toxic
              precision    recall  f1-score   support

   not_toxic       0.96      0.95      0.95     57888
       toxic       0.56      0.61      0.59      6090

    accuracy                           0.92     63978
   macro avg       0.76      0.78      0.77     63978
weighted avg       0.92      0.92      0.92     63978


------------------------------------------------------------

Classification report for label: severe_toxic
                  precision    recall  f1-score   support

not_severe_toxic       1.00      1.00      1.00     63611
    severe_toxic       0.40      0.15      0.22       367

        accuracy                           0.99     63978
       macro avg       0.70      0.57      0.61     63978
    weighted avg       0.99      0.99      0.99     63978


------------------------------------------------------------

Classification report for label: obscene
              precision    recall  f1-score   support

 not_obscene   