In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import joblib

In [65]:
df = pd.read_csv('dataset.csv')
manual = pd.read_csv('manual_testing.csv')

In [66]:
df.drop('Unnamed: 0', axis=1, inplace=True)
manual.drop('Unnamed: 0', axis=1, inplace=True)

In [67]:
df.shape

(60544, 2)

In [68]:
manual.shape

(200, 2)

# CHECKING IF THE MANUAL DATA IS STILL IN THE DATA TO BE USED TO TRAIN THE MODELS

In [69]:
print(len(manual))
print(len(df))

200
60544


In [70]:
print(type(manual['text'][0]))
print(type(df['text'][0]))
same = []
for i in range(0, len(manual)):
    for j in range(0, len(df)):
        if df['text'][j] == manual['text'][i]:
            print(i, '   ', j)
            same.append(j)

<class 'str'>
<class 'str'>
175     31775
178     13648
186     27846


In [71]:
df.drop(same, axis=0, inplace=True)
df = df.reset_index()

# CHECK FOR NULL VALUES

In [72]:
print(df.isnull().sum())

index    0
text     5
class    0
dtype: int64


In [73]:
df = df.dropna()

In [74]:
print(df.isnull().sum())

index    0
text     0
class    0
dtype: int64


In [75]:
df = df.reset_index()
df.shape

(60536, 4)

In [76]:
df.drop('level_0', axis=1, inplace=True)
df

Unnamed: 0,index,text,class
0,0,london s metropolitan police said saturday nig...,1
1,1,washington new york reuters usa president dona...,1
2,2,it was an eyebrow raising moment she did not t...,1
3,3,washington reuters president elect donald trum...,1
4,4,belfast northern ireland rioters set a hijacke...,1
...,...,...,...
60531,60539,build that wall and when mexico sends its peop...,0
60532,60540,andy cohen has gotten an intimate look at luan...,0
60533,60541,kansas secretary of state kris kobach has repo...,0
60534,60542,get celebs updates directly to your inbox subs...,0


In [77]:
df.drop('index', axis=1, inplace=True)
df

Unnamed: 0,text,class
0,london s metropolitan police said saturday nig...,1
1,washington new york reuters usa president dona...,1
2,it was an eyebrow raising moment she did not t...,1
3,washington reuters president elect donald trum...,1
4,belfast northern ireland rioters set a hijacke...,1
...,...,...
60531,build that wall and when mexico sends its peop...,0
60532,andy cohen has gotten an intimate look at luan...,0
60533,kansas secretary of state kris kobach has repo...,0
60534,get celebs updates directly to your inbox subs...,0


# TOKENIZATION AND LEMMATIZATION

In [20]:
import nltk

In [21]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bishe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bishe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
from nltk.stem import WordNetLemmatizer

In [23]:
lemmatizer = WordNetLemmatizer()

In [24]:
for index,row in df.iterrows():
    filter_sentence = ''
    
    sentence = row['text']
    
    words = nltk.word_tokenize(sentence)#tokenization
    
    for word in words:
        filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(word)).lower()#lemmatization
        
    df.loc[index, 'text'] = filter_sentence

In [25]:
df

Unnamed: 0,text,class
0,london s metropolitan police said saturday ni...,1
1,washington new york reuters usa president don...,1
2,it wa an eyebrow raising moment she did not t...,1
3,washington reuters president elect donald tru...,1
4,belfast northern ireland rioter set a hijacke...,1
...,...,...
60531,build that wall and when mexico sends it peop...,0
60532,andy cohen ha gotten an intimate look at luan...,0
60533,kansa secretary of state kris kobach ha repor...,0
60534,get celebs update directly to your inbox subs...,0


In [26]:
x = df['text']
y = df['class']
m = manual['text']

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization_tfidf = TfidfVectorizer(stop_words='english')
xv_train = vectorization_tfidf.fit_transform(x_train)
xv_test = vectorization_tfidf.transform(x_test)
manual_v = vectorization_tfidf.transform(m)

In [40]:
print("Training data dimensions:", xv_train.shape)
print("Testing data dimensions:", xv_test.shape)
print("Manual testing data dimensions:", manual_v.shape)

Training data dimensions: (48428, 150541)
Testing data dimensions: (12108, 150541)
Manual testing data dimensions: (200, 150541)


In [79]:
y_test.to_csv("y_test.csv")
x_train.to_csv('training_data.csv')
x_test.to_csv('testing_data.csv')

In [42]:
from scipy.sparse import csr_matrix, save_npz

In [43]:
save_npz('testing_data.npz', xv_test)
save_npz('manual_testing_data.npz', manual_v)

In [44]:
feature_names = vectorization_tfidf.get_feature_names_out()

# Extract top features for training set
num_top_features = 100
top_features_train = [feature_names[i] for i in xv_train.sum(axis=0).argsort()[0, -num_top_features:]][::-1]

In [45]:
top_features_train

[array([['democratic', 'america', 'korea', 'health', 'foreign', 'http',
         'military', 'monday', 'member', 'city', 'work', 'attack', 'come',
         'friday', 'need', 'support', 'thursday', 'case', 'presidential',
         'wednesday', 'tuesday', 'minister', 'national', 'according',
         'democrat', 'administration', 'report', 'video', 'percent',
         'child', 'thing', 'leader', 'russian', 'life', 'source',
         'medium', 'senate', 'com', 'north', 'china', 'way', 'world',
         'security', 'did', 'month', 'family', 'hillary', 'million',
         'tax', 'statement', 'don', 'group', 'image', 'police', 'law',
         'going', 'russia', 'think', 'court', 'vote', 'make', 'want',
         'week', 'know', 'twitter', 'washington', 'woman', 'official',
         'campaign', 'day', 'american', 'right', 'united', 'party',
         'election', 'white', 'say', 'told', 'news', 'country', 'donald',
         'government', 'obama', 'like', 'just', 'clinton', 'house',
         'tim

In [46]:
num_top_features = 100
top_features_test = [feature_names[i] for i in xv_test.sum(axis=0).argsort()[0, -num_top_features:]][::-1]

In [47]:
top_features_test

[array([['korea', 'old', 'monday', 'including', 'work', 'http',
         'military', 'attack', 'foreign', 'come', 'friday', 'health',
         'thursday', 'city', 'wednesday', 'presidential', 'tuesday',
         'member', 'minister', 'source', 'case', 'percent', 'report',
         'administration', 'video', 'need', 'support', 'child',
         'democrat', 'national', 'senate', 'according', 'north', 'month',
         'medium', 'leader', 'way', 'russian', 'did', 'vote', 'life',
         'statement', 'don', 'thing', 'million', 'world', 'com', 'law',
         'family', 'security', 'china', 'image', 'group', 'going', 'tax',
         'think', 'hillary', 'court', 'russia', 'want', 'police', 'make',
         'know', 'week', 'twitter', 'official', 'campaign', 'washington',
         'woman', 'right', 'day', 'united', 'american', 'election',
         'party', 'white', 'told', 'say', 'news', 'country', 'government',
         'donald', 'obama', 'like', 'just', 'time', 'reuters', 'house',
         '

# SUPPORT VECTOR MACHINE

In [48]:
from sklearn.svm import SVC

In [49]:
SV = SVC(kernel='linear')
SV.fit(xv_train, y_train)

In [50]:
joblib.dump(SV, 'Support_Vector.joblib')

['Support_Vector.joblib']

# LOGISTIC REGRESSION

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [53]:
joblib.dump(LR, 'Logistics_Regression.joblib')

['Logistics_Regression.joblib']

# DECISION TREE CLASSIFIER

In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [56]:
joblib.dump(DT, 'DecisionTree_Classifier.joblib')

['DecisionTree_Classifier.joblib']

# GRADIENT BOOSTING CLASSIFIER

In [57]:
from sklearn.ensemble import GradientBoostingClassifier

In [58]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [59]:
joblib.dump(GBC, 'GradientBoosting_Classifier.joblib')

['GradientBoosting_Classifier.joblib']

# RANDOM FOREST CLASSIFIER

In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

In [62]:
joblib.dump(RFC, 'RandomForest_Classifier.joblib')

['RandomForest_Classifier.joblib']