In [1]:
import pandas as pd


In [2]:
# Load the raw training set from the CSV file in the working directory.
train_csv_path = "train.csv"
df = pd.read_csv(train_csv_path)


In [3]:
# Keep only the two columns the rest of the notebook uses: the article text
# and its label.
df = df.loc[:, ['text', 'label']]


In [4]:
# Preview the first five rows as a quick sanity check of the load.
df.head(5)


Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
# Work on a 7,000-row random subset (presumably to keep preprocessing and
# training time manageable).
# The seed is fixed so that the same rows are drawn on every
# Restart & Run All; without it the null counts, class balance and all
# model metrics below change from run to run.
df = df.sample(n=7000, random_state=42)
df.shape


(7000, 2)

In [6]:
# Count missing values per column.
df.isna().sum()


text     16
label     0
dtype: int64

In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')


In [9]:

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Patterns are compiled once here instead of being looked up on every call.
_URL_PATTERN = re.compile(r'http\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Lazily-filled cache for the NLTK objects that are identical for every call.
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text):
    """Normalise a raw document into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, remove links, remove every character that
    is not an ASCII letter or whitespace, tokenise, drop English stopwords,
    and Porter-stem what remains.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty
        string when no token survives.
    """
    if not _CLEAN_TEXT_RESOURCES:
        # The stopword set and the stemmer do not depend on the input, so
        # build them on the first call only rather than once per document.
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = PorterStemmer()
    stop_words = _CLEAN_TEXT_RESOURCES['stop_words']
    stemmer = _CLEAN_TEXT_RESOURCES['stemmer']

    text = text.lower()

    # Links must be removed BEFORE punctuation/digits are stripped: after
    # stripping, a URL such as "http://123" collapses to a bare "http",
    # which the link pattern (it needs at least one character after
    # "http") can no longer match, so the fragment would leak into the
    # tokens.
    text = _URL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)

    stemmed_words = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(stemmed_words)


In [10]:
# Store the normalised version of each article in a new column, leaving the
# original 'text' column untouched.
df['clean_text'] = df['text'].apply(clean_text)


In [11]:
# Check the class balance of the sampled data.
label_counts = df['label'].value_counts()
label_counts


label
0    3506
1    3478
Name: count, dtype: int64

In [12]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report


# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
features = df['clean_text']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary and weights on the training texts only, then
# apply the already-fitted vectorizer to the held-out texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
# Fit a seeded random forest on the TF-IDF training matrix
# (`fit` returns the estimator itself, so the call can be chained).
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out set.
y_pred = rf_classifier.predict(X_test_tfidf)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# One print call; sep="\n" puts each item on its own line.
print(
    "Confusion Matrix:",
    conf_matrix,
    "\nClassification Report:",
    class_report,
    sep="\n",
)


Confusion Matrix:
[[666  37]
 [ 98 596]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       703
           1       0.94      0.86      0.90       694

    accuracy                           0.90      1397
   macro avg       0.91      0.90      0.90      1397
weighted avg       0.91      0.90      0.90      1397



In [14]:
from sklearn.linear_model import LogisticRegression

# Fit a seeded logistic regression on the same TF-IDF training matrix
# (`fit` returns the estimator itself, so the call can be chained).
lr_classifier = LogisticRegression(random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out set.
y_pred_lr = lr_classifier.predict(X_test_tfidf)
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
class_report_lr = classification_report(y_test, y_pred_lr)

# One print call; sep="\n" puts each item on its own line.
print(
    "Logistic Regression - Confusion Matrix:",
    conf_matrix_lr,
    "\nLogistic Regression - Classification Report:",
    class_report_lr,
    sep="\n",
)


Logistic Regression - Confusion Matrix:
[[659  44]
 [ 42 652]]

Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       703
           1       0.94      0.94      0.94       694

    accuracy                           0.94      1397
   macro avg       0.94      0.94      0.94      1397
weighted avg       0.94      0.94      0.94      1397



In [None]:
from sklearn.svm import SVC

# Fit a seeded linear-kernel SVM on the same TF-IDF training matrix
# (`fit` returns the estimator itself, so the call can be chained).
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out set.
y_pred_svm = svm_classifier.predict(X_test_tfidf)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)

# One print call; sep="\n" puts each item on its own line.
print(
    "SVM - Confusion Matrix:",
    conf_matrix_svm,
    "\nSVM - Classification Report:",
    class_report_svm,
    sep="\n",
)
