# Machine Learning Model (Naive Bayes Classifier)

In [6]:
import os

In [7]:
os.getcwd()

'C:\\Users\\Dell'

In [8]:
os.chdir('C:\\Users\\Dell')

In [9]:
os.getcwd()

'C:\\Users\\Dell'

In [10]:
import numpy as np
import pandas as pd

In [13]:
trump=pd.read_csv("Trump_Trudeau.csv",index_col=0)
trump.head()

Unnamed: 0,id,author,status
0,1,Donald J. Trump,I will be making a major statement from the @W...
1,2,Donald J. Trump,Just arrived at #ASEAN50 in the Philippines fo...
2,3,Donald J. Trump,"After my tour of Asia, all Countries dealing w..."
3,4,Donald J. Trump,Great to see @RandPaul looking well and back o...
4,5,Donald J. Trump,Excited to be heading home to see the House pa...


In [14]:
trump=trump.drop(columns=["id"])                    #Remove Unwanted Columns

In [15]:
trump.head()

Unnamed: 0,author,status
0,Donald J. Trump,I will be making a major statement from the @W...
1,Donald J. Trump,Just arrived at #ASEAN50 in the Philippines fo...
2,Donald J. Trump,"After my tour of Asia, all Countries dealing w..."
3,Donald J. Trump,Great to see @RandPaul looking well and back o...
4,Donald J. Trump,Excited to be heading home to see the House pa...


# Step 1

In [16]:
X=trump["status"]                                    #Feature Variables

In [17]:
X.head()

0    I will be making a major statement from the @W...
1    Just arrived at #ASEAN50 in the Philippines fo...
2    After my tour of Asia, all Countries dealing w...
3    Great to see @RandPaul looking well and back o...
4    Excited to be heading home to see the House pa...
Name: status, dtype: object

In [18]:
y=trump["author"]                                   #Target Variable

In [19]:
y.head()

0    Donald J. Trump
1    Donald J. Trump
2    Donald J. Trump
3    Donald J. Trump
4    Donald J. Trump
Name: author, dtype: object

# Rules

In [20]:
X.dtype

dtype('O')

In [21]:
type(X)

pandas.core.series.Series

In [22]:
X.shape

(400,)

In [23]:
trump.isna().sum()

author    0
status    0
dtype: int64

# Step 2

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=43,stratify=y)

# If there are 2 or more Features, then only we do Feature Scaling.

# Since, Feature Variable has dtype as 'object' we needed to convert it to Numeric

# Since, the Features were Text Features and not Categorical Features, we used Count Vectorizer to convert it.

# Using Count Vectorizer

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
cv=CountVectorizer(min_df=0.05,max_df=0.9,stop_words="english")

In [38]:
X_train_cv=cv.fit_transform(X_train)                                  #It is a Sparse array

In [39]:
X_train_cv=pd.DataFrame(X_train_cv.toarray(),columns=cv.get_feature_names())

In [40]:
X_train_cv.head()

Unnamed: 0,amp,au,canada,des,du,et,great,https,just,la,le,les,nous,pour,rt,today,trade,vietnam
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


# Check The Rules on Train Dataset of Features

In [41]:
X_train_cv.dtypes

amp        int64
au         int64
canada     int64
des        int64
du         int64
et         int64
great      int64
https      int64
just       int64
la         int64
le         int64
les        int64
nous       int64
pour       int64
rt         int64
today      int64
trade      int64
vietnam    int64
dtype: object

In [42]:
type(X_train_cv)

pandas.core.frame.DataFrame

In [43]:
X_train_cv.shape

(280, 18)

In [45]:
X_train_cv.isna().sum()

amp        0
au         0
canada     0
des        0
du         0
et         0
great      0
https      0
just       0
la         0
le         0
les        0
nous       0
pour       0
rt         0
today      0
trade      0
vietnam    0
dtype: int64

In [46]:
X_test_cv=cv.transform(X_test)                                  #It is a Sparse array

In [47]:
X_test_cv=pd.DataFrame(X_test_cv.toarray(),columns=cv.get_feature_names())

In [48]:
X_test_cv.head()

Unnamed: 0,amp,au,canada,des,du,et,great,https,just,la,le,les,nous,pour,rt,today,trade,vietnam
0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2,0,1,1,0,2,1,0,1,0,1,0,0,2,1,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


# Check The Rules on Test Dataset of Features

In [49]:
X_test_cv.dtypes

amp        int64
au         int64
canada     int64
des        int64
du         int64
et         int64
great      int64
https      int64
just       int64
la         int64
le         int64
les        int64
nous       int64
pour       int64
rt         int64
today      int64
trade      int64
vietnam    int64
dtype: object

In [50]:
type(X_test_cv)

pandas.core.frame.DataFrame

In [51]:
X_test_cv.shape

(120, 18)

In [52]:
X_test_cv.isna().sum()

amp        0
au         0
canada     0
des        0
du         0
et         0
great      0
https      0
just       0
la         0
le         0
les        0
nous       0
pour       0
rt         0
today      0
trade      0
vietnam    0
dtype: int64

# Step 3

In [53]:
from sklearn.naive_bayes import MultinomialNB

In [54]:
naive=MultinomialNB()

In [55]:
naive.fit(X_train_cv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Step 4

# Evaluate

In [93]:
naive.score(X_test_cv,y_test)

0.825

# Predict

In [94]:
naive.predict(X_test_cv)

array(['Justin Trudeau', 'Donald J. Trump', 'Justin Trudeau',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Justin Trudeau', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Justin Trudeau', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Justin Trudeau',
       'Justin Trudeau', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Justin Trudeau',
       'Justin Trudeau', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Justin Trudeau', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J.

# Prediction On New Observations

In [57]:
statement="Fake News"
tweet=cv.transform([statement])

In [58]:
naive.predict(tweet)

array(['Donald J. Trump'], dtype='<U15')

In [59]:
statement1="Canada"
tweet=cv.transform([statement1])

In [60]:
naive.predict(tweet)

array(['Justin Trudeau'], dtype='<U15')

In [61]:
statement2="U.S. planning to cut military presence in Iraq"
tweet=cv.transform([statement2])

In [62]:
naive.predict(tweet)

array(['Donald J. Trump'], dtype='<U15')

In [63]:
statement3="From fighting climate change, to defending human rights, to supporting refugees and vulnerable populations, Canada’s priorities are clear - and I know @BobRae48 is going to be an incredible advocate for them at the @UN. Thanks for stopping by today, Bob."
tweet=cv.transform([statement3])

In [64]:
naive.predict(tweet)

array(['Justin Trudeau'], dtype='<U15')

# Using Tfidf Vectorizer

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
tfidf=TfidfVectorizer(min_df=0.05,max_df=0.9,stop_words="english")

In [72]:
X_train_tfidf=tfidf.fit_transform(X_train)                              #This is a Sparse array

In [73]:
X_train_tfidf=pd.DataFrame(X_train_tfidf.toarray(),columns=tfidf.get_feature_names())

In [74]:
X_train_tfidf.head()

Unnamed: 0,amp,au,canada,des,du,et,great,https,just,la,le,les,nous,pour,rt,today,trade,vietnam
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.445929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.638126,0.627648
3,0.874088,0.0,0.0,0.0,0.0,0.0,0.0,0.485768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
X_test_tfidf=tfidf.transform(X_test)                              #This is a Sparse array

In [77]:
X_test_tfidf=pd.DataFrame(X_test_tfidf.toarray(),columns=tfidf.get_feature_names())

In [78]:
X_test_tfidf.head()

Unnamed: 0,amp,au,canada,des,du,et,great,https,just,la,le,les,nous,pour,rt,today,trade,vietnam
0,0.0,0.0,0.926143,0.0,0.0,0.0,0.0,0.377172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.593849,0.29127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.750004,0.0
2,0.0,0.288937,0.280134,0.0,0.577875,0.249828,0.0,0.114085,0.0,0.241953,0.0,0.0,0.560268,0.23713,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 3

In [79]:
from sklearn.svm import SVC

In [80]:
svm=SVC(kernel="rbf",degree=2)

In [81]:
svm.fit(X_train_tfidf,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# Step 4

# Evaluate

In [95]:
svm.score(X_test_tfidf,y_test)

0.825

# Predict

In [96]:
svm.predict(X_test_tfidf)

array(['Justin Trudeau', 'Donald J. Trump', 'Justin Trudeau',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Justin Trudeau', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Justin Trudeau',
       'Donald J. Trump', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Justin Trudeau', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Justin Trudeau',
       'Justin Trudeau', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Justin Trudeau', 'Justin Trudeau',
       'Justin Trudeau', 'Donald J. Trump', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. Trump',
       'Justin Trudeau', 'Justin Trudeau', 'Donald J. Trump',
       'Donald J. Trump', 'Donald J. Trump', 'Donald J. T

# Prediction On New Observations

In [97]:
statement="Fake News"
tweet=tfidf.transform([statement])                               #This is Sparse array, so need to convert it to DataFrame
tweet=pd.DataFrame(tweet.toarray(),columns=tfidf.get_feature_names())

In [98]:
svm.predict(tweet)

array(['Donald J. Trump'], dtype=object)

In [99]:
statement1="From fighting climate change, to defending human rights, to supporting refugees and vulnerable populations, Canada’s priorities are clear - and I know @BobRae48 is going to be an incredible advocate for them at the @UN. Thanks for stopping by today, Bob."
tweet=tfidf.transform([statement1])                               #This is Sparse array, so need to convert it to DataFrame
tweet=pd.DataFrame(tweet.toarray(),columns=tfidf.get_feature_names())

In [100]:
svm.predict(tweet)

array(['Justin Trudeau'], dtype=object)