In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function with lemmatization and stopwords removal
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Load datasets
train_data = pd.read_csv(r"C:\Users\hp081\Downloads\AWM_train.csv")
dev_data = pd.read_csv(r"C:\Users\hp081\Downloads\AWM_dev.csv")
test_data = pd.read_csv(r"C:\Users\hp081\Downloads\AWM_test_without_labels.csv")

# Combine train and dev datasets
train_data = pd.concat([train_data, dev_data], ignore_index=True)

# Apply preprocessing
train_data['Text'] = train_data['Text'].apply(preprocess_text)
test_data['Text'] = test_data['Text'].apply(preprocess_text)

# Encode labels
train_data['Class'] = train_data['Class'].map({'Abusive': 1, 'Non-Abusive': 0})

# TF-IDF Vectorizer with n-grams and stopword removal
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english', smooth_idf=True)
X = train_data['Text']
y = train_data['Class']
X_tfidf = vectorizer.fit_transform(X)

# Cross-validation for model evaluation
def evaluate_model_with_cross_validation(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"Cross-Validation Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Logistic Regression Model with Hyperparameter Tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'saga']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_tfidf, y)
print("Best hyperparameters for Logistic Regression:", grid_search.best_params_)

# Train and Evaluate Best Model
best_lr_model = grid_search.best_estimator_
evaluate_model_with_cross_validation(best_lr_model, X_tfidf, y)

# Train and Evaluate SVM Model
svm_model = SVC(kernel='linear')
evaluate_model_with_cross_validation(svm_model, X_tfidf, y)

# Train and Evaluate Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
evaluate_model_with_cross_validation(rf_model, X_tfidf, y)

# Train and Evaluate Gradient Boosting Model
gb_model = GradientBoostingClassifier()
evaluate_model_with_cross_validation(gb_model, X_tfidf, y)

# Predict on Test Data using the best model
best_model = best_lr_model  # Replace with the model that performs best during your evaluation
X_test_tfidf = vectorizer.transform(test_data['Text'])
test_predictions = best_model.predict(X_test_tfidf)

# Map predicted classes back to labels
label_mapping = {0: 'Non-Abusive', 1: 'Abusive'}
test_data['Predicted_Class_Label'] = test_predictions  # Add predictions as a column first
test_data['Predicted_Class_Label'] = test_data['Predicted_Class_Label'].map(label_mapping)  # Map to labels

# Save predictions as a CSV file
output_dir = r"C:\Users\hp081\Documents"
output_csv_path = os.path.join(output_dir, "AWM_test_predictions.csv")

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# Save labels in CSV format, including all relevant columns
test_data[['id', 'Text', 'Predicted_Class_Label']].to_csv(output_csv_path, index=False)
print(f"Predictions saved to {output_csv_path}")

# Function to preprocess the input text
def preprocess_text_input(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Function to predict if the input comment is abusive or not
def predict_comment(comment, model, vectorizer):
    # Preprocess the input comment
    processed_comment = preprocess_text_input(comment)
    
    # Transform the comment using the TF-IDF vectorizer
    comment_vector = vectorizer.transform([processed_comment])
    
    # Predict using the trained model
    prediction = model.predict(comment_vector)
    
    # Map the prediction to the label
    label_mapping = {0: 'Non-Abusive', 1: 'Abusive'}
    predicted_label = label_mapping[prediction[0]]
    
    return predicted_label

# Example usage
if __name__ == "__main__":
    print("Enter a comment to check if it is abusive or not:")
    input_comment = input("> ")
    result = predict_comment(input_comment, best_model, vectorizer)
    print(f"The comment is predicted to be: {result}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp081\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp081\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp081\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Best hyperparameters for Logistic Regression: {'C': 1, 'solver': 'liblinear'}
Cross-Validation Accuracy: 0.6258 ± 0.0438
Cross-Validation Accuracy: 0.6199 ± 0.0427
Cross-Validation Accuracy: 0.5986 ± 0.0472
Cross-Validation Accuracy: 0.5859 ± 0.0330
Predictions saved to C:\Users\hp081\Documents\AWM_test_predictions.csv
Enter a comment to check if it is abusive or not:


>  അവൾ ഒരു മോശം പെൺകുട്ടിയല്ല


The comment is predicted to be: Non-Abusive


In [2]:
from sklearn.metrics import classification_report

# Split the development dataset into features and labels
X_dev = dev_data['Text']
y_dev = dev_data['Class'].map({'Abusive': 1, 'Non-Abusive': 0})  # Encode labels

# Transform the development dataset using the TF-IDF vectorizer
X_dev_tfidf = vectorizer.transform(X_dev)

# Predict on the development dataset using the best model
dev_predictions = best_model.predict(X_dev_tfidf)

# Print the classification report
print("Classification Report on Development Data:")
print(classification_report(y_dev, dev_predictions, target_names=['Non-Abusive', 'Abusive']))


Classification Report on Development Data:
              precision    recall  f1-score   support

 Non-Abusive       0.61      0.59      0.60       326
     Abusive       0.57      0.59      0.58       303

    accuracy                           0.59       629
   macro avg       0.59      0.59      0.59       629
weighted avg       0.59      0.59      0.59       629



In [5]:
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Enhanced preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    tokens = word_tokenize(text)  # Use word tokenization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Load datasets
train_data = pd.read_csv(r"C:\Users\hp081\Downloads\AWM_train.csv")
dev_data = pd.read_csv(r"C:\Users\hp081\Downloads\AWM_dev.csv")
test_data = pd.read_csv(r"C:\Users\hp081\Downloads\AWM_test_without_labels.csv")

# Combine train and dev datasets
train_data = pd.concat([train_data, dev_data], ignore_index=True)

# Apply preprocessing
train_data['Text'] = train_data['Text'].apply(preprocess_text)
test_data['Text'] = test_data['Text'].apply(preprocess_text)

# Encode labels
train_data['Class'] = train_data['Class'].map({'Abusive': 1, 'Non-Abusive': 0})

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english', smooth_idf=True)
X = train_data['Text']
y = train_data['Class']
X_tfidf = vectorizer.fit_transform(X)

# Function to evaluate model with cross-validation
def evaluate_model_with_cross_validation(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"Cross-Validation Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

    # Fit the model on the full training data
    model.fit(X, y)
    y_pred = model.predict(X)
    print("\nClassification Report (Cross-Validation):")
    print(classification_report(y, y_pred))
    print(f"Accuracy (Cross-Validation): {accuracy_score(y, y_pred):.4f}")

# Train and evaluate SVM model
print("\nEvaluating SVM Model:")
svm_model = SVC(kernel='linear')
evaluate_model_with_cross_validation(svm_model, X_tfidf, y)

# Predict on test data
X_test_tfidf = vectorizer.transform(test_data['Text'])
test_predictions = svm_model.predict(X_test_tfidf)

# Map predicted classes back to labels
label_mapping = {0: 'Non-Abusive', 1: 'Abusive'}
test_data['Predicted_Class_Label'] = test_predictions
test_data['Predicted_Class_Label'] = test_data['Predicted_Class_Label'].map(label_mapping)

# Save predictions as a CSV file
output_dir = r"C:\Users\hp081\Documents"
output_csv_path = os.path.join(output_dir, "AWM_test1_predictions(svm).csv")
os.makedirs(output_dir, exist_ok=True)
test_data[['id', 'Text', 'Predicted_Class_Label']].to_csv(output_csv_path, index=False)
print(f"Predictions saved to {output_csv_path}")

# Function to preprocess and predict input text
def preprocess_text_input(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def predict_comment(comment, model, vectorizer):
    processed_comment = preprocess_text_input(comment)
    comment_vector = vectorizer.transform([processed_comment])
    prediction = model.predict(comment_vector)
    label_mapping = {0: 'Non-Abusive', 1: 'Abusive'}
    return label_mapping[prediction[0]]




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp081\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp081\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp081\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Evaluating SVM Model:
Cross-Validation Accuracy: 0.6199 ± 0.0427

Classification Report (Cross-Validation):
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      1728
           1       0.90      0.90      0.90      1834

    accuracy                           0.90      3562
   macro avg       0.90      0.90      0.90      3562
weighted avg       0.90      0.90      0.90      3562

Accuracy (Cross-Validation): 0.8989
Predictions saved to C:\Users\hp081\Documents\AWM_test1_predictions(svm).csv
The comment 'You are bad person!' is classified as: Non-Abusive


In [6]:
# Example usage
if __name__ == "__main__":
    print("Enter a comment to check if it is abusive or not:")
    input_comment = input("> ")
    result = predict_comment(input_comment, best_model, vectorizer)
    print(f"The comment is predicted to be: {result}")

Enter a comment to check if it is abusive or not:


>  ഞാൻ നിന്നെ കൊല്ലാൻ പോകുന്നു


The comment is predicted to be: Non-Abusive
