In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import joblib

In [3]:
# Read the main corpus and the separate manual-testing sample from disk.
df, manual = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset.csv', 'manual_testing.csv')
)

In [4]:
# Both frames carry an 'Unnamed: 0' column that is not used afterwards; discard it.
df = df.drop(columns=['Unnamed: 0'])
manual = manual.drop(columns=['Unnamed: 0'])

In [5]:
# (rows, columns) of the main dataset after 'Unnamed: 0' was removed.
df.shape

(38617, 6)

In [6]:
# (rows, columns) of the manual-testing sample after 'Unnamed: 0' was removed.
manual.shape

(20, 6)

# CHECKING WHETHER ANY MANUAL-TESTING ROWS ARE STILL PRESENT IN THE DATA USED TO TRAIN THE MODELS

In [7]:
# Row counts: manual-testing sample first, then the main dataset.
for frame in (manual, df):
    print(len(frame))

20
38617


In [10]:
# Sanity check: both text columns should hold plain strings before comparing.
print(type(manual['text'][0]))
print(type(df['text'][0]))

# Index the dataset texts once (text -> row labels holding that text) so each
# manual text is resolved with a dictionary lookup rather than by rescanning
# every dataset row. Missing texts are skipped: NaN never compares equal to
# anything, so they could not produce a match anyway.
rows_by_text = {}
for j, text in df['text'].dropna().items():
    rows_by_text.setdefault(text, []).append(j)

# `same` collects the labels of dataset rows whose text also appears in the
# manual sample, in the order (manual row, dataset row).
same = []
for i, text in manual['text'].dropna().items():
    for j in rows_by_text.get(text, ()):
        print(i, '   ', j)
        same.append(j)

<class 'str'>
<class 'str'>


In [9]:
# Remove the rows that duplicate manual-testing texts, then renumber the rows.
# reset_index() keeps the previous labels in a new 'index' column.
df = df.drop(index=same).reset_index()

# CHECK FOR NULL VALUES

In [11]:
# Number of missing values in each column.
print(df.isna().sum())

index    0
title    0
text     1
month    0
year     0
day      0
class    0
dtype: int64


In [12]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [13]:
# Re-count missing values per column to confirm none remain.
print(df.isna().sum())

index    0
title    0
text     0
month    0
year     0
day      0
class    0
dtype: int64


In [14]:
# Renumber the rows after dropna(). An 'index' column already exists from the
# earlier reset, so this call stores the old labels in a new 'level_0' column
# instead; that extra column is why the shape reports 8 columns.
df = df.reset_index()
df.shape

(38615, 8)

In [15]:
# 'level_0' only holds the row labels from before the latest reset; remove it
# and show the resulting frame.
df = df.drop(columns=['level_0'])
df

Unnamed: 0,index,title,text,month,year,day,class
0,0,as usa budget fight looms republicans flip the...,washington reuters the head of a conservative ...,12.0,2017,31,1
1,1,usa military to accept transgender recruits on...,washington reuters transgender people will be ...,12.0,2017,29,1
2,2,senior usa republican senator let mr mueller d...,washington reuters the special counsel investi...,12.0,2017,31,1
3,3,fbi russia probe helped by australian diplomat...,washington reuters trump campaign adviser geor...,12.0,2017,30,1
4,4,trump wants postal service to charge much more...,seattle washington reuters president donald tr...,12.0,2017,29,1
...,...,...,...,...,...,...,...
38610,38612,seven iranians freed in the prisoner swap have...,century wire says this week the historic inter...,1.0,2016,20,0
38611,38613,hashtag hell the fake left,by dady chery and gilbert mercierall writers w...,1.0,2016,19,0
38612,38614,astroturfing journalist reveals brainwashing t...,vic bishop waking timesour reality is carefull...,1.0,2016,19,0
38613,38615,the new american century an era of fraud,paul craig robertsin the last years of the cen...,1.0,2016,19,0


# TOKENIZATION AND LEMMATIZATION

In [16]:
import nltk

In [17]:
# Fetch the NLTK resources used below: 'punkt' for word_tokenize and 'wordnet'
# for WordNetLemmatizer. The value shown for the cell is the return of the last call.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bishe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bishe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
# Single shared lemmatizer instance reused for every token in the corpus.
lemmatizer = WordNetLemmatizer()

In [20]:
def _lemmatize_text(sentence):
    """Return `sentence` rebuilt from its lowercased lemmas.

    The text is split with ``nltk.word_tokenize``; each token is lemmatized with
    the shared ``lemmatizer`` and then lowercased. Every lemma is preceded by one
    space, so a non-empty result starts with a space and a sentence with no
    tokens yields ``''``.
    """
    tokens = nltk.word_tokenize(sentence)  # tokenization
    # Build the string in a single join instead of growing it token by token.
    return ''.join(
        ' ' + str(lemmatizer.lemmatize(token)).lower()  # lemmatization
        for token in tokens
    )


# One pass over the column replaces the row-by-row iterrows() / df.loc write-back.
df['text'] = df['text'].apply(_lemmatize_text)

In [21]:
# Display the frame to inspect the lemmatized 'text' column.
df

Unnamed: 0,index,title,text,month,year,day,class
0,0,as usa budget fight looms republicans flip the...,washington reuters the head of a conservative...,12.0,2017,31,1
1,1,usa military to accept transgender recruits on...,washington reuters transgender people will be...,12.0,2017,29,1
2,2,senior usa republican senator let mr mueller d...,washington reuters the special counsel invest...,12.0,2017,31,1
3,3,fbi russia probe helped by australian diplomat...,washington reuters trump campaign adviser geo...,12.0,2017,30,1
4,4,trump wants postal service to charge much more...,seattle washington reuters president donald t...,12.0,2017,29,1
...,...,...,...,...,...,...,...
38610,38612,seven iranians freed in the prisoner swap have...,century wire say this week the historic inter...,1.0,2016,20,0
38611,38613,hashtag hell the fake left,by dady chery and gilbert mercierall writer w...,1.0,2016,19,0
38612,38614,astroturfing journalist reveals brainwashing t...,vic bishop waking timesour reality is careful...,1.0,2016,19,0
38613,38615,the new american century an era of fraud,paul craig robertsin the last year of the cen...,1.0,2016,19,0


In [22]:
# Features and labels for model training.
x = df['text']
y = df['class']

# df['text'] has been tokenized, lemmatized and lowercased, and the TF-IDF
# vocabulary is built from that normalized text. The manual sample must go
# through the same normalization; otherwise its un-lemmatized word forms are
# looked up against a vocabulary that does not contain them.
m = manual['text'].apply(
    lambda raw_text: ''.join(
        ' ' + str(lemmatizer.lemmatize(token)).lower()
        for token in nltk.word_tokenize(raw_text)
    )
)

In [23]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, and the
# test artifacts written to disk from it, are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split and the manual sample.
# NOTE(review): the fitted vectorizer is not saved in the cells shown here;
# scoring new raw text later would need it — confirm whether that is intended.
vectorization_tfidf = TfidfVectorizer()
xv_train = vectorization_tfidf.fit_transform(x_train)
xv_test = vectorization_tfidf.transform(x_test)
manual_v = vectorization_tfidf.transform(m)

In [25]:
# Report (documents, vocabulary terms) for each vectorized set.
for label, matrix in (
    ("Training data dimensions:", xv_train),
    ("Testing data dimensions:", xv_test),
    ("Manual testing data dimensions:", manual_v),
):
    print(label, matrix.shape)

Training data dimensions: (30892, 88373)
Testing data dimensions: (7723, 88373)
Manual testing data dimensions: (20, 88373)


In [26]:
# Write the test labels to disk.
y_test.to_csv("y_test.csv")
# Show the container type of x_train (the raw text split, not the TF-IDF matrix).
print(type(x_train))
# Write the training texts to disk; y_train is not written in this cell.
x_train.to_csv('training_data.csv')

<class 'pandas.core.series.Series'>


In [27]:
from scipy.sparse import csr_matrix, save_npz

In [28]:
# Store the vectorized test set and manual sample as sparse .npz files.
for npz_path, sparse_matrix in (
    ('testing_data.npz', xv_test),
    ('manual_testing_data.npz', manual_v),
):
    save_npz(npz_path, sparse_matrix)

# SUPPORT VECTOR MACHINE

In [29]:
from sklearn.svm import SVC

In [30]:
# Support vector classifier with a linear kernel, trained on the TF-IDF training matrix.
SV = SVC(kernel='linear')
SV.fit(xv_train, y_train)

In [31]:
# Serialize the fitted SVC to disk; the cell shows the list of files written.
joblib.dump(SV, 'Support_Vector.joblib')

['Support_Vector.joblib']

# LOGISTIC REGRESSION

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
# Logistic regression with scikit-learn's default settings, trained on the TF-IDF training matrix.
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [34]:
# Serialize the fitted logistic regression model to disk.
joblib.dump(LR, 'Logistics_Regression.joblib')

['Logistics_Regression.joblib']

# DECISION TREE CLASSIFIER

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [36]:
# Decision tree trained on the TF-IDF training matrix. The seed is fixed, as for
# the gradient-boosting and random-forest models, so refitting on the same data
# reproduces the same tree.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(xv_train, y_train)

In [37]:
# Serialize the fitted decision tree to disk.
joblib.dump(DT, 'DecisionTree_Classifier.joblib')

['DecisionTree_Classifier.joblib']

# GRADIENT BOOSTING CLASSIFIER

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

In [39]:
# Gradient-boosting classifier with a fixed seed, trained on the TF-IDF training matrix.
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [40]:
# Serialize the fitted gradient-boosting model to disk.
joblib.dump(GBC, 'GradientBoosting_Classifier.joblib')

['GradientBoosting_Classifier.joblib']

# RANDOM FOREST CLASSIFIER

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
# Random-forest classifier with a fixed seed, trained on the TF-IDF training matrix.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

In [43]:
# Serialize the fitted random forest to disk.
joblib.dump(RFC, 'RandomForest_Classifier.joblib')

['RandomForest_Classifier.joblib']