In [1]:
import pandas as pd 
import numpy as np
import spacy

In [2]:
# Load the training CSV; the path is relative to the current working directory.
df=pd.read_csv("data/twitter_training.csv")

In [4]:
# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0,id,country,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
# Show dtypes and non-null counts; a count below the row total means missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   label    74682 non-null  object
 3   text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
# Missing values per column, before any cleaning.
df.isna().sum()

id           0
country      0
label        0
text       686
dtype: int64

In [7]:
# Remove rows that are exact duplicates across every column.
# This mutates `df` in place, so cells above now show a stale view of the data.
df.drop_duplicates(inplace=True)

In [8]:
# Re-count missing values now that duplicate rows are gone.
df.isna().sum()

id           0
country      0
label        0
text       326
dtype: int64

In [9]:
# (rows, columns) after de-duplication.
df.shape

(71981, 4)

In [10]:
# Drop every row that still has a missing value in any column (in place).
# The surviving rows keep their original index labels, so the index is no
# longer a contiguous 0..n-1 range after this step.
df.dropna(inplace=True)

In [11]:
# (rows, columns) after removing rows with missing values.
df.shape

(71655, 4)

In [12]:
# Confirm that no missing values remain in any column.
df.isna().sum()

id         0
country    0
label      0
text       0
dtype: int64

In [13]:
# Column names available to the preprocessing and modelling cells below.
df.columns

Index(['id', 'country', 'label', 'text'], dtype='object')

In [14]:
# Load spaCy's small English pipeline once; `preprocess` below reuses it for
# every text. The `en_core_web_sm` model package must already be installed.
nlp = spacy.load("en_core_web_sm")

In [15]:
def preprocess(text):
    """Return the lemmas of `text` with stop words and punctuation removed.

    The text is run through the module-level spaCy pipeline `nlp`. Every token
    that is neither a stop word nor punctuation contributes its lemma, and the
    lemmas are joined with single spaces in their original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [16]:
# Clean every tweet with `preprocess` and store the result in a new column.
# This runs the spaCy pipeline once per row, so it is the slow step of the notebook.
df['Preprocessed Text'] = df['text'].map(preprocess)

In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Encode the string labels as integers. The fitted encoder stays in `le_model`
# so the integer codes can be mapped back through `le_model.classes_`.
# NOTE: this overwrites df['label']; re-running the cell would re-fit the
# encoder on the already-encoded integers and lose the original class names.
le_model = LabelEncoder().fit(df['label'])
df['label'] = le_model.transform(df['label'])

In [18]:
# Hold out 20% of the rows for testing. `stratify` keeps the label proportions
# the same in both splits and the fixed seed makes the split reproducible.
# NOTE(review): the head() preview shows several rows sharing one `id` with
# near-identical text. A row-level split may place such variants on both sides
# and inflate test scores — consider a group-aware split on `id`. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed Text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [19]:
# Report how many samples ended up in the training and test splits.
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (57324,)
Shape of X_test:  (14331,)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Turn the cleaned text into TF-IDF features, capping the vocabulary at 1000 terms.
# The vectorizer is fitted on the training split only and then reused on the
# test split, so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
# Baseline classifier: multinomial Naive Bayes trained on the TF-IDF features,
# scored by plain accuracy on the held-out split.
model = MultinomialNB().fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.55


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [24]:
# Random forest trained on the same TF-IDF features as the Naive Bayes baseline.
# The seed is pinned (the same value used for the train/test split) so that
# Restart & Run All grows the same trees and reproduces the reported score;
# without it every run yields a slightly different accuracy.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vectorized, y_train)

In [25]:
# Predict labels for the held-out test features with the fitted random forest.
y_pred_rf = rf.predict(X_test_vectorized)

In [26]:
# Test-set accuracy of the random forest, for comparison with the Naive Bayes score above.
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Accuracy: 0.8452306189379667
