In [108]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

In [109]:
df = pd.read_csv(r"C:\Users\ARTHI\Downloads\cleaned_data.csv")
df.head()

Unnamed: 0,Text,Fake,Hate,Target,Severity
0,USER Abhi tak 2000 ke note me mujhe GPS nano c...,0,0,0,0
1,USER USER Abe katiye tumse kuch huaa toh jata ...,1,1,2,3
2,USER Ye sab sazish hai bina saman ke koi kaise...,1,1,2,2
3,abe jao tum to dasko pahle hi fash gye the jab...,1,0,1,1
4,Ab ye afbah kaun faila Raha hai ki Shahhen bag...,0,0,0,0


In [110]:
nltk.download('wordnet')
nltk.download('stopwords')

# Text Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ARTHI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ARTHI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
df['Text'] = df['Text'].fillna('').astype(str)  # Handle missing values
df['Text'] = df['Text'].apply(preprocess_text)

# Step 2: Vectorizing the text using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, min_df=3, max_df=0.9, stop_words='english')
X_vectorized = vectorizer.fit_transform(df['Text'])


In [112]:
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X_vectorized, df['Target'])

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [113]:
nb = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}  # Alpha controls Laplace smoothing
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [114]:
best_nb = grid_search.best_estimator_
best_nb.fit(X_train, y_train)

y_pred = best_nb.predict(X_test)
print("Best Alpha:", grid_search.best_params_['alpha'])
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Best Alpha: 0.1
Accuracy: 0.7412854030501089

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.51      0.60       471
           1       0.73      0.84      0.78       451
           2       0.72      0.73      0.72       465
           3       0.78      0.90      0.83       449

    accuracy                           0.74      1836
   macro avg       0.74      0.74      0.74      1836
weighted avg       0.74      0.74      0.73      1836


Confusion Matrix:
 [[240  76  82  73]
 [ 30 380  19  22]
 [ 55  53 339  18]
 [  4  12  31 402]]


In [115]:

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_vectorized)

# Step 2: Balance the dataset for Severity classification
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X_tfidf, df['Severity'])

from sklearn.model_selection import StratifiedKFold, train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

nb = MultinomialNB()
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False]  
}

grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_nb_sev = grid_search.best_estimator_

y_sev_pred = best_nb_sev.predict(X_test)

# Step 8: Print the results
print("Best Hyperparameters for Severity:", grid_search.best_params_)
print("Accuracy for Severity:", accuracy_score(y_test, y_sev_pred))
print("\nClassification Report for Severity:\n", classification_report(y_test, y_sev_pred))
print("\nConfusion Matrix for Severity:\n", confusion_matrix(y_test, y_sev_pred))

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Example with Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("\nLogistic Regression Accuracy for Severity:", accuracy_score(y_test, y_pred_lr))


Best Hyperparameters for Severity: {'alpha': 0.1, 'fit_prior': False}
Accuracy for Severity: 0.6993464052287581

Classification Report for Severity:
               precision    recall  f1-score   support

           0       0.68      0.57      0.62       459
           1       0.56      0.55      0.55       459
           2       0.69      0.74      0.71       459
           3       0.85      0.95      0.89       459

    accuracy                           0.70      1836
   macro avg       0.69      0.70      0.69      1836
weighted avg       0.69      0.70      0.69      1836


Confusion Matrix for Severity:
 [[260 121  48  30]
 [ 81 251  95  32]
 [ 33  70 339  17]
 [  6   9  10 434]]

Logistic Regression Accuracy for Severity: 0.7227668845315904


In [119]:
import pandas as pd

# Ensure there are no missing values in the 'Tweet' column
validation_data['Tweet'] = validation_data['Tweet'].fillna("")

# Generate predictions for target and severity classifications
X_validation_vectorized = vectorizer.transform(validation_data['Tweet'])

y_target_pred = best_nb.predict(X_validation_vectorized)  # Replace with your trained Target model
y_severity_pred = best_nb_sev.predict(X_validation_vectorized)  # Replace with your trained Severity model

# Add predictions to the validation dataset
validation_data['Target_Prediction'] = y_target_pred
validation_data['Severity_Prediction'] = y_severity_pred

# Save the combined predictions to a single CSV file
output_path = r"C:\Users\ARTHI\Downloads\validation_target_severity_predictions.csv"
validation_data[['Tweet', 'Target_Prediction', 'Severity_Prediction']].to_csv(output_path, index=False)

print(f"Combined predictions saved to '{output_path}'.")


Combined predictions saved to 'C:\Users\ARTHI\Downloads\validation_target_severity_predictions.csv'.


In [121]:
import pandas as pd

# Load the test dataset
test_file_path = r"C:\Users\ARTHI\Downloads\Test_Task_B.xlsx"
test_data = pd.read_excel(test_file_path)

#  Handle missing values in the 'Tweet' column
test_data['Tweet'] = test_data['Tweet'].fillna("")

X_test_vectorized = vectorizer.transform(test_data['Tweet'])

y_target_pred_test = best_nb.predict(X_test_vectorized)  
y_severity_pred_test = best_nb_sev.predict(X_test_vectorized)  

test_data['Target_Prediction'] = y_target_pred_test
test_data['Severity_Prediction'] = y_severity_pred_test

# Save the predictions to a single CSV file
output_path_test = r"C:\Users\ARTHI\Downloads\test_target_severity_predictions.csv"
test_data[['Tweet', 'Target_Prediction', 'Severity_Prediction']].to_csv(output_path_test, index=False)

print(f"Test predictions saved to '{output_path_test}'.")


Test predictions saved to 'C:\Users\ARTHI\Downloads\test_target_severity_predictions.csv'.
