In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import Pipeline
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

In [9]:
# Try to import NLTK components, but provide fallbacks if not available
try:
    import nltk
    # Download NLTK resources with error handling
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    print("NLTK resources downloaded")
    
    # Check if stopwords are available
    from nltk.corpus import stopwords
    stopwords_available = True
    STOPWORDS = set(stopwords.words('english'))
    print("NLTK stopwords loaded")
    
    # Check if lemmatizer is available
    from nltk.stem import WordNetLemmatizer
    lemmatizer_available = True
    lemmatizer = WordNetLemmatizer()
    print("NLTK lemmatizer loaded")
    
except Exception as e:
    print(f"NLTK loading error: {e}")
    stopwords_available = False
    lemmatizer_available = False
    
    # Define basic stopwords if NLTK's aren't available
    STOPWORDS = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', 'when', 
              'where', 'how', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 
              'has', 'had', 'do', 'does', 'did', 'to', 'from', 'of', 'at', 'by', 'for', 
              'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
              'after', 'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
              'under', 'again', 'further', 'then', 'once', 'here', 'there', 'all', 'any', 
              'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 
              'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 
              'just', 'should', 'now', 'im', 'youre', 'hes', 'shes', 'theyre', 'weve',
              'youve', 'theyve', 'ive', 'doesnt', 'dont', 'cant', 'wont', 'isnt', 'arent'}
    print("Using basic stopwords list instead of NLTK")

NLTK resources downloaded
NLTK stopwords loaded
NLTK lemmatizer loaded


In [10]:
# Load the datasets
print("Loading datasets...")
try:
    train_data = pd.read_csv("C:/Users/ashir/Downloads/train_E6oV3lV.csv")
    test_data = pd.read_csv("C:/Users/ashir/Downloads/test_tweets_anuFYb8.csv")
    
    print(f"Training data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")

    # Check for missing values
    print("\nMissing values in training data:")
    print(train_data.isnull().sum())
    print("\nMissing values in test data:")
    print(test_data.isnull().sum())

    # Display class distribution in training data
    print("\nClass distribution in training data:")
    print(train_data['label'].value_counts(normalize=True) * 100)
    
except Exception as e:
    print(f"Error loading data: {e}")
    raise

Loading datasets...
Training data shape: (31962, 3)
Test data shape: (17197, 2)

Missing values in training data:
id       0
label    0
tweet    0
dtype: int64

Missing values in test data:
id       0
tweet    0
dtype: int64

Class distribution in training data:
label
0    92.98542
1     7.01458
Name: proportion, dtype: float64


In [11]:
# Define text cleaning function
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove hashtags (keep the text)
    text = re.sub(r'#', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [12]:
# Apply text cleaning to both datasets
print("Cleaning text...")
train_data['cleaned_tweet'] = train_data['tweet'].apply(clean_text)
test_data['cleaned_tweet'] = test_data['tweet'].apply(clean_text)

Cleaning text...


In [13]:
# Define preprocessing function with fallbacks
def preprocess_text(text):
    if not isinstance(text, str) or not text:
        return ""
    
    # Simple tokenization by splitting on whitespace
    # This avoids relying on NLTK's word_tokenize
    tokens = text.split()
    
    # Remove stopwords if available
    if stopwords_available:
        try:
            tokens = [token for token in tokens if token not in STOPWORDS]
        except Exception as e:
            print(f"Error removing stopwords: {e}")
    else:
        # Use the basic stopwords list defined earlier
        tokens = [token for token in tokens if token not in STOPWORDS]
    
    # Apply lemmatization if available
    if lemmatizer_available:
        try:
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        except Exception as e:
            print(f"Error lemmatizing: {e}")
    else:
        # Simple stemming as fallback
        stemmed_tokens = []
        for token in tokens:
            if len(token) > 3:  # Only stem if word is longer than 3 characters
                if token.endswith('ing') and len(token) > 4:
                    token = token[:-3]
                elif token.endswith('ed') and len(token) > 3:
                    token = token[:-2]
                elif token.endswith('es') and len(token) > 3:
                    token = token[:-2]
                elif token.endswith('s') and len(token) > 2:
                    token = token[:-1]
            stemmed_tokens.append(token)
        tokens = stemmed_tokens
    
    # Rejoin tokens to form preprocessed text
    return ' '.join(tokens)


In [14]:
# Apply preprocessing
print("Preprocessing text...")
train_data['processed_tweet'] = train_data['cleaned_tweet'].apply(preprocess_text)
test_data['processed_tweet'] = test_data['cleaned_tweet'].apply(preprocess_text)

Preprocessing text...


In [15]:
# Check if any tweets became empty after preprocessing and handle them
train_data['processed_tweet'] = train_data['processed_tweet'].apply(lambda x: x if x else "empty_tweet")
test_data['processed_tweet'] = test_data['processed_tweet'].apply(lambda x: x if x else "empty_tweet")

In [16]:
# Exploratory Data Analysis 
# Calculate tweet lengths
train_data['tweet_length'] = train_data['tweet'].apply(lambda x: len(x) if isinstance(x, str) else 0)
train_data['word_count'] = train_data['processed_tweet'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

In [17]:
# Visualize tweet length distribution
try:
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    sns.histplot(data=train_data, x='tweet_length', hue='label', bins=30, kde=True)
    plt.title('Tweet Length Distribution by Class')
    plt.xlabel('Tweet Length (characters)')
    plt.ylabel('Count')

    plt.subplot(1, 2, 2)
    sns.histplot(data=train_data, x='word_count', hue='label', bins=30, kde=True)
    plt.title('Word Count Distribution by Class')
    plt.xlabel('Word Count')
    plt.ylabel('Count')

    plt.tight_layout()
    plt.savefig('tweet_length_distribution.png')
    plt.close()
    print("Saved tweet length distribution visualization")
except Exception as e:
    print(f"Error creating visualizations: {e}")

Saved tweet length distribution visualization


In [18]:
# Try to create word clouds if wordcloud package is available
try:
    from wordcloud import WordCloud
    
    # Function to generate wordclouds
    def generate_wordcloud(data, label, title):
        text = ' '.join(data[data['label'] == label]['processed_tweet'].dropna())
        wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=200).generate(text)
        
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.tight_layout()
        plt.savefig(f'wordcloud_class_{label}.png')
        plt.close()

    # Generate word clouds for each class
    generate_wordcloud(train_data, 0, 'Word Cloud for Non-Hate Speech (Label 0)')
    generate_wordcloud(train_data, 1, 'Word Cloud for Hate Speech (Label 1)')
    print("Generated word clouds for both classes")
except Exception as e:
    print(f"Wordcloud creation skipped: {e}")

Wordcloud creation skipped: No module named 'wordcloud'


In [20]:
# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data['processed_tweet'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['label']
)


In [22]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=5,
    max_df=0.8,
    ngram_range=(1, 2)
)

In [23]:
# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [24]:
# Define models to try
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'SVM': SVC(kernel='linear', C=1.0, probability=True, class_weight='balanced', random_state=42),
    'XGBoost': xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
}

In [25]:
# Train and evaluate each model
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val_tfidf)
    
    # Calculate F1 score
    f1 = f1_score(y_val, y_val_pred)
    model_results[name] = f1
    
    print(f"{name} F1 Score: {f1:.4f}")
    print(classification_report(y_val, y_val_pred))


Training Logistic Regression...
Logistic Regression F1 Score: 0.5925
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      5945
           1       0.48      0.77      0.59       448

    accuracy                           0.93      6393
   macro avg       0.73      0.85      0.78      6393
weighted avg       0.95      0.93      0.93      6393


Training Random Forest...
Random Forest F1 Score: 0.6786
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5945
           1       0.80      0.59      0.68       448

    accuracy                           0.96      6393
   macro avg       0.88      0.79      0.83      6393
weighted avg       0.96      0.96      0.96      6393


Training SVM...
SVM F1 Score: 0.5814
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      5945
           1       0.48      0.74      0.58       448

    accuracy    

Parameters: { "use_label_encoder" } are not used.



XGBoost F1 Score: 0.5719
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5945
           1       0.85      0.43      0.57       448

    accuracy                           0.95      6393
   macro avg       0.90      0.71      0.77      6393
weighted avg       0.95      0.95      0.95      6393



In [26]:
# Visualize model comparison
try:
    plt.figure(figsize=(10, 6))
    models_df = pd.DataFrame(list(model_results.items()), columns=['Model', 'F1 Score'])
    sns.barplot(x='F1 Score', y='Model', data=models_df.sort_values('F1 Score', ascending=False))
    plt.title('Model Performance Comparison (F1 Score)')
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()
    print("Saved model comparison visualization")
except Exception as e:
    print(f"Error creating model comparison plot: {e}")

Saved model comparison visualization


In [28]:
# Setup parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

In [30]:
# Create and run GridSearchCV
try:
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_tfidf, y_train)

    # Get the best parameters and model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    print(f"\nBest Parameters: {best_params}")

    # Evaluate the best model on validation set
    y_val_pred_best = best_model.predict(X_val_tfidf)
    best_f1 = f1_score(y_val, y_val_pred_best)
    print(f"\nBest Model F1 Score: {best_f1:.4f}")
    print(classification_report(y_val, y_val_pred_best))
except Exception as e:
    print(f"Grid search failed: {e}")
    # Use the best model from earlier comparisons if grid search fails
    best_model_name = max(model_results.items(), key=lambda x: x[1])[0]
    best_model = models[best_model_name]
    best_f1 = model_results[best_model_name]
    print(f"Using {best_model_name} as the best model instead")


Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best Parameters: {'class_weight': 'balanced_subsample', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Best Model F1 Score: 0.6861
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5945
           1       0.72      0.66      0.69       448

    accuracy                           0.96      6393
   macro avg       0.85      0.82      0.83      6393
weighted avg       0.96      0.96      0.96      6393



In [31]:
# Create ensemble model
print("\nCreating ensemble model...")
try:
    ensemble = VotingClassifier(
        estimators=[
            ('lr', models['Logistic Regression']),
            ('rf', best_model),
            ('svm', models['SVM']),
            ('xgb', models['XGBoost'])
        ],
        voting='soft'
    )

    # Train the ensemble
    ensemble.fit(X_train_tfidf, y_train)

    # Evaluate the ensemble
    y_val_pred_ensemble = ensemble.predict(X_val_tfidf)
    ensemble_f1 = f1_score(y_val, y_val_pred_ensemble)
    print(f"\nEnsemble Model F1 Score: {ensemble_f1:.4f}")
    print(classification_report(y_val, y_val_pred_ensemble))

    # Use the best model for final prediction (either ensemble or best individual model)
    final_model = ensemble if ensemble_f1 > best_f1 else best_model
    final_model_name = "Ensemble" if ensemble_f1 > best_f1 else best_model_name
    print(f"Using {final_model_name} for final predictions")
except Exception as e:
    print(f"Ensemble creation failed: {e}")
    # Use the best individual model if ensemble fails
    final_model = best_model
    print("Using best individual model for final predictions")



Creating ensemble model...


Parameters: { "use_label_encoder" } are not used.




Ensemble Model F1 Score: 0.6924
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5945
           1       0.78      0.62      0.69       448

    accuracy                           0.96      6393
   macro avg       0.88      0.80      0.84      6393
weighted avg       0.96      0.96      0.96      6393

Using Ensemble for final predictions


In [33]:
# Make final predictions on test data
X_test_tfidf = tfidf_vectorizer.transform(test_data['processed_tweet'])
test_predictions = final_model.predict(X_test_tfidf)

In [37]:
def analyze_feature_importance(model, tfidf_vectorizer, top_n=20):
    """
    Analyze and visualize feature importance from the trained model.
    Works with different model types (tree-based, linear, ensemble).
    
    Parameters:
    - model: Trained classifier model
    - tfidf_vectorizer: Fitted TF-IDF vectorizer
    - top_n: Number of top features to display
    
    Returns:
    - Dictionary with feature importance information
    """
    feature_names = tfidf_vectorizer.get_feature_names_out()
    results = {}
    
    # For tree-based models (Random Forest, XGBoost)
    if hasattr(model, 'feature_importances_'):
        print("Analyzing feature importance for tree-based model...")
        feature_importances = model.feature_importances_
        
        # Get indices of top features
        indices = np.argsort(feature_importances)[-top_n:]
        
        # Create sorted lists for returning
        top_features = [feature_names[i] for i in indices[::-1]]
        top_importance = feature_importances[indices[::-1]]
        
        # Store results
        results = {
            'model_type': 'tree_based',
            'features': top_features,
            'importance': top_importance
        }
        
        # Visualize
        plt.figure(figsize=(12, 8))
        plt.title(f'Top {top_n} Most Important Features')
        plt.barh(range(top_n), feature_importances[indices], align='center')
        plt.yticks(range(top_n), [feature_names[i] for i in indices])
        plt.xlabel('Feature Importance')
        plt.gca().invert_yaxis()  # Display highest importance at the top
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.show()
        
    # For linear models (Logistic Regression, SVM)
    elif hasattr(model, 'coef_'):
        print("Analyzing feature coefficients for linear model...")
        coef = model.coef_[0]
        
        # Get indices of top positive and negative coefficients
        top_positive_indices = np.argsort(coef)[-top_n:]
        top_negative_indices = np.argsort(coef)[:top_n]
        
        # Create sorted lists for returning
        pos_features = [feature_names[i] for i in top_positive_indices[::-1]]
        pos_importance = coef[top_positive_indices[::-1]]
        neg_features = [feature_names[i] for i in top_negative_indices]
        neg_importance = coef[top_negative_indices]
        
        # Store results
        results = {
            'model_type': 'linear',
            'positive_features': pos_features,
            'positive_coefficients': pos_importance,
            'negative_features': neg_features,
            'negative_coefficients': neg_importance
        }
        
        # Visualize
        plt.figure(figsize=(12, 10))
        
        plt.subplot(2, 1, 1)
        plt.title(f'Top {top_n} Features Associated with Hate Speech (Label 1)')
        plt.barh(range(top_n), coef[top_positive_indices[::-1]], align='center')
        plt.yticks(range(top_n), [feature_names[i] for i in top_positive_indices[::-1]])
        plt.xlabel('Coefficient Value')
        
        plt.subplot(2, 1, 2)
        plt.title(f'Top {top_n} Features Associated with Non-Hate Speech (Label 0)')
        plt.barh(range(top_n), coef[top_negative_indices], align='center')
        plt.yticks(range(top_n), [feature_names[i] for i in top_negative_indices])
        plt.xlabel('Coefficient Value')
        
        plt.tight_layout()
        plt.savefig('feature_coefficients.png')
        plt.show()
        
    # For VotingClassifier or other ensemble models
    elif hasattr(model, 'estimators_'):
        print("Analyzing ensemble model components...")
        
        # Try to find a component with feature importance
        for name, estimator in zip(model.estimator_names_, model.estimators_):
            if hasattr(estimator, 'feature_importances_'):
                print(f"Using feature importance from {name} component")
                feature_importances = estimator.feature_importances_
                
                # Get indices of top features
                indices = np.argsort(feature_importances)[-top_n:]
                
                # Create sorted lists for returning
                top_features = [feature_names[i] for i in indices[::-1]]
                top_importance = feature_importances[indices[::-1]]
                
                # Store results
                results = {
                    'model_type': f'ensemble_{name}',
                    'features': top_features,
                    'importance': top_importance
                }
                
                # Visualize
                plt.figure(figsize=(12, 8))
                plt.title(f'Top {top_n} Most Important Features (from {name})')
                plt.barh(range(top_n), feature_importances[indices[::-1]], align='center')
                plt.yticks(range(top_n), [feature_names[i] for i in indices[::-1]])
                plt.xlabel('Feature Importance')
                plt.tight_layout()
                plt.savefig(f'feature_importance_{name}.png')
                plt.show()
                
                break
                
            elif hasattr(estimator, 'coef_'):
                print(f"Using feature coefficients from {name} component")
                coef = estimator.coef_[0]
                
                # Get indices of top positive and negative coefficients
                top_positive_indices = np.argsort(coef)[-top_n:]
                top_negative_indices = np.argsort(coef)[:top_n]
                
                # Create sorted lists for returning
                pos_features = [feature_names[i] for i in top_positive_indices[::-1]]
                pos_importance = coef[top_positive_indices[::-1]]
                neg_features = [feature_names[i] for i in top_negative_indices]
                neg_importance = coef[top_negative_indices]
                
                # Store results
                results = {
                    'model_type': f'ensemble_{name}',
                    'positive_features': pos_features,
                    'positive_coefficients': pos_importance,
                    'negative_features': neg_features,
                    'negative_coefficients': neg_importance
                }
                
                # Visualize
                plt.figure(figsize=(12, 10))
                
                plt.subplot(2, 1, 1)
                plt.title(f'Top {top_n} Features Associated with Hate Speech (from {name})')
                plt.barh(range(top_n), coef[top_positive_indices[::-1]], align='center')
                plt.yticks(range(top_n), [feature_names[i] for i in top_positive_indices[::-1]])
                plt.xlabel('Coefficient Value')
                
                plt.subplot(2, 1, 2)
                plt.title(f'Top {top_n} Features Associated with Non-Hate Speech (from {name})')
                plt.barh(range(top_n), coef[top_negative_indices], align='center')
                plt.yticks(range(top_n), [feature_names[i] for i in top_negative_indices])
                plt.xlabel('Coefficient Value')
                
                plt.tight_layout()
                plt.savefig(f'feature_coefficients_{name}.png')
                plt.show()
                
                break
    
    else:
        print("Model doesn't provide feature importance information.")
        results = {'model_type': 'unknown', 'message': 'No feature importance available'}
    
    # Print top features
    if 'features' in results:
        print("\nTop features for classification:")
        for i, (feature, importance) in enumerate(zip(results['features'], results['importance']), 1):
            print(f"{i}. {feature}: {importance:.4f}")
    
    elif 'positive_features' in results:
        print("\nTop features associated with hate speech (Label 1):")
        for i, (feature, importance) in enumerate(zip(results['positive_features'], results['positive_coefficients']), 1):
            print(f"{i}. {feature}: {importance:.4f}")
            
        print("\nTop features associated with non-hate speech (Label 0):")
        for i, (feature, importance) in enumerate(zip(results['negative_features'], results['negative_coefficients']), 1):
            print(f"{i}. {feature}: {importance:.4f}")
    
    return results

In [38]:
def create_feature_importance_df(model, tfidf_vectorizer):
    """Create a DataFrame with all features and their importance values"""
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        df = pd.DataFrame({'feature': feature_names, 'importance': importances})
        df = df.sort_values('importance', ascending=False)
        
    elif hasattr(model, 'coef_'):
        coef = model.coef_[0]
        df = pd.DataFrame({'feature': feature_names, 'coefficient': coef})
        df = df.sort_values('coefficient', ascending=False)
        
    else:
        # For ensemble models, try to get importance from a component
        if hasattr(model, 'estimators_'):
            for estimator in model.estimators_:
                if hasattr(estimator, 'feature_importances_'):
                    importances = estimator.feature_importances_
                    df = pd.DataFrame({'feature': feature_names, 'importance': importances})
                    df = df.sort_values('importance', ascending=False)
                    return df
                elif hasattr(estimator, 'coef_'):
                    coef = estimator.coef_[0]
                    df = pd.DataFrame({'feature': feature_names, 'coefficient': coef})
                    df = df.sort_values('coefficient', ascending=False)
                    return df
        
        # If no importance info is found
        return pd.DataFrame({'feature': feature_names, 'importance': np.zeros(len(feature_names))})
    
    return df

In [39]:
# Example usage of the above function
importance_df = create_feature_importance_df(final_model, tfidf_vectorizer)

In [40]:
importance_df.head(30)

Unnamed: 0,feature,coefficient
117,allahsoil,8.330002
6627,white,7.13404
4915,racism,6.052557
4916,racist,5.619901
606,black,5.318755
6690,woman,5.211439
6231,trump,5.159736
559,bigot,4.540331
3385,latest,4.263644
3968,misogyny,4.095157
