In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# load data
df = pd.read_csv('removed_training.csv')

# check for and replace missing values in the 'tweet' column
df['tweet'] = df['tweet'].fillna('')  # replace NaN with empty string

# print unique values from the 'gender' column
if 'gender' in df.columns:
    unique_genders = df['gender'].unique()
    print("Unique values in 'gender' column:", unique_genders)
else:
    print("'gender' column not found in the DataFrame.")



Unique values in 'gender' column: ['female' 'bot' 'male']


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# load the dataset and replace missing values in the 'tweet' column
df = pd.read_csv('removed_training.csv')
df['tweet'] = df['tweet'].fillna('')  # replace NaN with empty strings

# extract tweet text and gender labels
tweets = df['tweet']
gender = df['gender']

# convert tweets to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(tweets)
y = gender

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=39, stratify=y)

# define classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=39, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=39, n_estimators=100),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=39),
    "SVM (Linear Kernel)": SVC(random_state=39, kernel='linear', probability=True)
}

# train and evaluate each classifier
results = {}

for name, clf in classifiers.items():
    print(f"=== {name} ===")
    # train the classifier
    clf.fit(X_train, y_train)
    
    # evaluate on validation data
    y_pred_val = clf.predict(X_val)
    accuracy_val = round(accuracy_score(y_val, y_pred_val), 2)
    
    print("Validation Results:")
    print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val))
    print("Classification Report:\n", classification_report(y_val, y_pred_val))
    print("Validation Accuracy:", accuracy_val)
    print()
    
    # store results
    results[name] = {
        "model": clf,
        "accuracy": accuracy_val
    }

# load the test dataset and process similarly
df_test = pd.read_csv('removed_test.csv')
df_test['tweet'] = df_test['tweet'].fillna('')  # replace NaN with empty strings

# extract test tweets and gender
tweets_test = df_test['tweet']
gender_test = df_test['gender']

# convert test tweets to TF-IDF features
X_test = vectorizer.transform(tweets_test)

# predict and evaluate each classifier on test data
for name, metrics in results.items():
    clf = metrics["model"]
    print(f"=== Test Results for {name} ===")
    y_test_pred = clf.predict(X_test)
    accuracy_test = round(accuracy_score(gender_test, y_test_pred), 2)
    
    print("Confusion Matrix:\n", confusion_matrix(gender_test, y_test_pred))
    print("Classification Report:\n", classification_report(gender_test, y_test_pred))
    print("Test Accuracy:", accuracy_test)
    print()

# feature importance analysis for Tree-Based Models
for name in ["Random Forest", "Gradient Boosting Classifier"]:
    if name in results:
        clf = results[name]["model"]
        feature_importances = clf.feature_importances_
        feature_names = vectorizer.get_feature_names_out()
        
        # create a DataFrame for feature importances
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': feature_importances
        }).sort_values(by='Importance', ascending=False)
        
        print(f"\n=== Top Features for {name} ===")
        print(importance_df.head(10))
        importance_df.to_csv(f'top_features_{name.replace(" ", "_").lower()}.csv', index=False)


=== Logistic Regression ===
Validation Results:
Confusion Matrix:
 [[394  10   8]
 [ 10 175  21]
 [  7  26 173]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.96      0.96      0.96       412
      female       0.83      0.85      0.84       206
        male       0.86      0.84      0.85       206

    accuracy                           0.90       824
   macro avg       0.88      0.88      0.88       824
weighted avg       0.90      0.90      0.90       824

Validation Accuracy: 0.9

=== Random Forest ===
Validation Results:
Confusion Matrix:
 [[379  15  18]
 [ 14 166  26]
 [ 13  42 151]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.93      0.92      0.93       412
      female       0.74      0.81      0.77       206
        male       0.77      0.73      0.75       206

    accuracy                           0.84       824
   macro avg       0.82      0.82      0.82       824

In [3]:
# get feature names and coefficients
log_reg_model = results["Logistic Regression"]["model"]
feature_names = vectorizer.get_feature_names_out()
coefficients = log_reg_model.coef_ 
classes = log_reg_model.classes_

# find top features for each class
n_top_features = 10
for idx, label in enumerate(classes):
    print(f"Top {n_top_features} features for class '{label}':")
    top_indices = np.argsort(coefficients[idx])[-n_top_features:][::-1]  # descending order
    top_features = [(feature_names[i], coefficients[idx][i]) for i in top_indices]
    for feature, coef in top_features:
        print(f"{feature}: {coef}")
    print()


Top 10 features for class 'bot':
small: 1.5534565107828098
introduction: 1.5258475559051394
cnn: 1.1508878805796838
born: 1.1073603695772705
hea: 1.102907741668665
history: 1.0832427842507042
world: 1.0321138321919079
eyes: 0.9712056960095425
developer: 0.9617263501346872
york: 0.9542586113081575

Top 10 features for class 'female':
women: 3.0176813503397155
thank: 3.012383406331091
xx: 2.0645947984007993
omg: 1.9792032203179168
video: 1.8864329567324674
instagram: 1.851036238295689
rights: 1.842833811906392
just: 1.7808719429894717
social: 1.6206442007798707
day: 1.5474327011476339

Top 10 features for class 'male':
iot: 2.5061171508761424
mate: 2.422898514342831
good: 2.1347021715107317
city: 1.7329655532521164
nice: 1.6545965520248
thanks: 1.583224136456463
dc: 1.5163200087354887
fans: 1.4110436868136529
new: 1.4049069399912817
beer: 1.4020865119977892

