In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_data(path):
    return pd.read_csv('./dataset/'+path+'.csv', index_col='id')

train_df = get_data('train')
test_df = get_data('test')

In [3]:
def diagnose_data(df):
    print(f'shape:\n{df.shape}\n')
    print(f'columns:\n{sorted(df.columns)}\n')
    print(f'n dtypes:\n{df.dtypes.value_counts()}\n')
    print(f'n uniques:')
    for col in df.columns:
        print(f'{col}: {df[col].nunique()}')
    print(f'\nunique values:')
    for col in df.columns:
        print(f'{col}: {df[col].unique()}')
    print(f'\nnull values:')
    for col in df.columns:
        print(f'{col}: {df[col].isnull().sum()}')
    print('\ninfo:')
    df.info()
    return df.head()


In [4]:
diagnose_data(train_df)

shape:
(20800, 4)

columns:
['author', 'label', 'text', 'title']

n dtypes:
object    3
int64     1
dtype: int64

n uniques:
title: 19803
author: 4201
text: 20386
label: 2

unique values:
title: ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'
 'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'
 'Why the Truth Might Get You Fired' ...
 'N.F.L. Playoffs: Schedule, Matchups and Odds - The New York Times'
 'Macy’s Is Said to Receive Takeover Approach by Hudson’s Bay - The New York Times'
 'NATO, Russia To Hold Parallel Exercises In Balkans']
author: ['Darrell Lucus' 'Daniel J. Flynn' 'Consortiumnews.com' ... 'D. Samuelson'
 'Judge Andrew Napolitano' 'Michael J. de la Merced and Rachel Abrams']
text: ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-B

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
diagnose_data(test_df)

shape:
(5200, 3)

columns:
['author', 'text', 'title']

n dtypes:
object    3
dtype: int64

n uniques:
title: 5056
author: 1732
text: 5126

unique values:
title: ['Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times'
 'Russian warships ready to strike terrorists near Aleppo'
 '#NoDAPL: Native American Leaders Vow to Stay All Winter, File Lawsuit Against Police'
 ...
 'California Today: What, Exactly, Is in Your Sushi? - The New York Times'
 '300 US Marines To Be Deployed To Russian Border In Norway'
 'Awkward Sex, Onscreen and Off - The New York Times']
author: ['David Streitfeld' nan 'Common Dreams' ... 'Jody Rosen'
 'Sheryl Gay Stolberg' 'Teddy Wayne']
text: ['PALO ALTO, Calif.  —   After years of scorning the political process, Silicon Valley has leapt into the fray. The prospect of a President Donald J. Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activist

Unnamed: 0_level_0,title,author,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [6]:
def null_df(df):
    percent_null = (df.isnull().sum() / df.isnull().count())
    null_df = pd.concat([df.isnull().sum(), percent_null], axis='columns', keys=['total', 'percent']).sort_values(by='percent', ascending=False)
    return null_df[null_df['percent'] > 0]

In [7]:
null_df(test_df)

Unnamed: 0,total,percent
author,503,0.096731
title,122,0.023462
text,7,0.001346


In [8]:
null_df(train_df)

Unnamed: 0,total,percent
author,1957,0.094087
title,558,0.026827
text,39,0.001875


In [9]:
train_df = train_df.fillna('')
test_df = test_df.fillna('')

In [10]:
diagnose_data(train_df)

shape:
(20800, 4)

columns:
['author', 'label', 'text', 'title']

n dtypes:
object    3
int64     1
dtype: int64

n uniques:
title: 19804
author: 4202
text: 20387
label: 2

unique values:
title: ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'
 'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'
 'Why the Truth Might Get You Fired' ...
 'N.F.L. Playoffs: Schedule, Matchups and Odds - The New York Times'
 'Macy’s Is Said to Receive Takeover Approach by Hudson’s Bay - The New York Times'
 'NATO, Russia To Hold Parallel Exercises In Balkans']
author: ['Darrell Lucus' 'Daniel J. Flynn' 'Consortiumnews.com' ... 'D. Samuelson'
 'Judge Andrew Napolitano' 'Michael J. de la Merced and Rachel Abrams']
text: ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-B

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [11]:
diagnose_data(test_df)

shape:
(5200, 3)

columns:
['author', 'text', 'title']

n dtypes:
object    3
dtype: int64

n uniques:
title: 5057
author: 1733
text: 5127

unique values:
title: ['Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times'
 'Russian warships ready to strike terrorists near Aleppo'
 '#NoDAPL: Native American Leaders Vow to Stay All Winter, File Lawsuit Against Police'
 ...
 'California Today: What, Exactly, Is in Your Sushi? - The New York Times'
 '300 US Marines To Be Deployed To Russian Border In Norway'
 'Awkward Sex, Onscreen and Off - The New York Times']
author: ['David Streitfeld' '' 'Common Dreams' ... 'Jody Rosen'
 'Sheryl Gay Stolberg' 'Teddy Wayne']
text: ['PALO ALTO, Calif.  —   After years of scorning the political process, Silicon Valley has leapt into the fray. The prospect of a President Donald J. Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activists

author: 0
text: 0

info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5200 entries, 20800 to 25999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5200 non-null   object
 1   author  5200 non-null   object
 2   text    5200 non-null   object
dtypes: object(3)
memory usage: 162.5+ KB


Unnamed: 0_level_0,title,author,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [12]:
X_train = train_df['text']
y_train = train_df['label']

In [13]:
X_train.head()

id
0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    Ever get the feeling your life circles the rou...
2    Why the Truth Might Get You Fired October 29, ...
3    Videos 15 Civilians Killed In Single US Airstr...
4    Print \nAn Iranian woman has been sentenced to...
Name: text, dtype: object

In [14]:
y_train.head()

id
0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

X_train = vectorizer.fit_transform(X_train)

In [16]:
print(X_train)

  (0, 153220)	0.05082122151984787
  (0, 87905)	0.04820801808277606
  (0, 22935)	0.023534978679583214
  (0, 28493)	0.026023232419584163
  (0, 49697)	0.019191596318298965
  (0, 31058)	0.06240470141167516
  (0, 35663)	0.05130677438415905
  (0, 52957)	0.016173066016037754
  (0, 94638)	0.04164560141349835
  (0, 85907)	0.02804869654197643
  (0, 153161)	0.012336509156248378
  (0, 4761)	0.036714945605664684
  (0, 134184)	0.03064091700273943
  (0, 68587)	0.024060916390673256
  (0, 126550)	0.029005700704916117
  (0, 121918)	0.031986171345421824
  (0, 130933)	0.021272419802078092
  (0, 38314)	0.026597488004304785
  (0, 81067)	0.023036654921286724
  (0, 142986)	0.04196216918918574
  (0, 27464)	0.05217235281487102
  (0, 26363)	0.03964872784255712
  (0, 95898)	0.03281415191706605
  (0, 141947)	0.024996020990469162
  (0, 133074)	0.03246475560195631
  :	:
  (20799, 96910)	0.015296090848059616
  (20799, 144104)	0.028810526983595167
  (20799, 87332)	0.016376054004725978
  (20799, 106087)	0.0183249017905

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier(max_iter=50)
model.fit(X_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [18]:
X_test = test_df['text']
X_test.head()

id
20800    PALO ALTO, Calif.  —   After years of scorning...
20801    Russian warships ready to strike terrorists ne...
20802    Videos #NoDAPL: Native American Leaders Vow to...
20803    If at first you don’t succeed, try a different...
20804    42 mins ago 1 Views 0 Comments 0 Likes 'For th...
Name: text, dtype: object

In [19]:
X_test = vectorizer.transform(X_test)

In [20]:
print(X_test)

  (0, 153527)	0.013190699766823128
  (0, 153171)	0.009940320725012915
  (0, 152302)	0.014817853799458687
  (0, 152052)	0.022091096545482337
  (0, 152033)	0.02413128836890018
  (0, 152012)	0.011026585480784237
  (0, 151996)	0.018753971792502086
  (0, 151993)	0.029485226122131936
  (0, 151967)	0.018955184543142295
  (0, 150897)	0.01824459294097078
  (0, 150868)	0.03330056111275929
  (0, 150140)	0.04483073962236792
  (0, 149831)	0.031991760417709354
  (0, 149762)	0.01839347372715725
  (0, 149588)	0.01763763779906196
  (0, 149365)	0.026881073510720013
  (0, 149147)	0.03282698398395805
  (0, 149146)	0.012068256239299338
  (0, 148993)	0.017500641400938072
  (0, 148967)	0.021577073732069883
  (0, 148024)	0.05631309438479222
  (0, 147694)	0.05885660484134696
  (0, 147266)	0.036187436638874686
  (0, 147070)	0.02457248169663963
  (0, 146492)	0.03207368596333743
  :	:
  (5199, 9377)	0.044787168263776406
  (5199, 8934)	0.03428347466413076
  (5199, 8118)	0.03203977522973344
  (5199, 8113)	0.0388910

In [21]:
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [22]:
y_test = get_data('submit')['label']
y_test

id
20800    0
20801    1
20802    0
20803    1
20804    1
        ..
25995    0
25996    1
25997    0
25998    1
25999    0
Name: label, Length: 5200, dtype: int64

In [23]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.65      0.62      2339
           1       0.69      0.62      0.65      2861

    accuracy                           0.64      5200
   macro avg       0.64      0.64      0.64      5200
weighted avg       0.64      0.64      0.64      5200



In [24]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Angga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
diagnose_data(train_df)

shape:
(20800, 4)

columns:
['author', 'label', 'text', 'title']

n dtypes:
object    3
int64     1
dtype: int64

n uniques:
title: 19804
author: 4202
text: 20387
label: 2

unique values:
title: ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'
 'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'
 'Why the Truth Might Get You Fired' ...
 'N.F.L. Playoffs: Schedule, Matchups and Odds - The New York Times'
 'Macy’s Is Said to Receive Takeover Approach by Hudson’s Bay - The New York Times'
 'NATO, Russia To Hold Parallel Exercises In Balkans']
author: ['Darrell Lucus' 'Daniel J. Flynn' 'Consortiumnews.com' ... 'D. Samuelson'
 'Judge Andrew Napolitano' 'Michael J. de la Merced and Rachel Abrams']
text: ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-B

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [28]:
diagnose_data(test_df)

shape:
(5200, 3)

columns:
['author', 'text', 'title']

n dtypes:
object    3
dtype: int64

n uniques:
title: 5057
author: 1733
text: 5127

unique values:
title: ['Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times'
 'Russian warships ready to strike terrorists near Aleppo'
 '#NoDAPL: Native American Leaders Vow to Stay All Winter, File Lawsuit Against Police'
 ...
 'California Today: What, Exactly, Is in Your Sushi? - The New York Times'
 '300 US Marines To Be Deployed To Russian Border In Norway'
 'Awkward Sex, Onscreen and Off - The New York Times']
author: ['David Streitfeld' '' 'Common Dreams' ... 'Jody Rosen'
 'Sheryl Gay Stolberg' 'Teddy Wayne']
text: ['PALO ALTO, Calif.  —   After years of scorning the political process, Silicon Valley has leapt into the fray. The prospect of a President Donald J. Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activists

Unnamed: 0_level_0,title,author,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [89]:
port_stem = PorterStemmer()

def stemming(content):
    review = re.sub(r'[^a-z]', ' ', content, flags=re.I).split()
    return ' '.join(port_stem.stem(word) for word in review if word not in stopwords.words('english'))

In [92]:
train_df['text'] = train_df['text'].apply(stemming)

In [116]:
X_train = train_df['text']
y_train = train_df['label']

In [119]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)

In [120]:
print(X_train)

  (0, 109030)	0.052098136868504494
  (0, 61111)	0.04941927504152846
  (0, 14101)	0.022488225097998308
  (0, 18204)	0.025767969485352808
  (0, 31966)	0.019665996454485433
  (0, 19703)	0.04124399606755635
  (0, 22764)	0.05259588955577312
  (0, 34252)	0.01303309158834434
  (0, 66199)	0.0417183348901771
  (0, 109370)	0.015109752968400893
  (0, 59519)	0.028310223087444738
  (0, 97395)	0.015345213190525609
  (0, 520)	0.024376173290678718
  (0, 94412)	0.02408915220616975
  (0, 45916)	0.01865600775451291
  (0, 89097)	0.028237611550602983
  (0, 85586)	0.029016637227115345
  (0, 92160)	0.01787247394464731
  (0, 24461)	0.025339393379859715
  (0, 42630)	0.01853784663564838
  (0, 55721)	0.020713584207967786
  (0, 101040)	0.040348747964904785
  (0, 17372)	0.049974370052961396
  (0, 16499)	0.04064492721799122
  (0, 67357)	0.03253770465085529
  :	:
  (20799, 38851)	0.022251118042736554
  (20799, 45724)	0.008248614819770888
  (20799, 100383)	0.012531796778056603
  (20799, 65981)	0.050605681587952284
  

In [121]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [123]:
test_df['text'] = test_df['text'].apply(stemming)

In [126]:
X_test = test_df['text']
X_test

id
20800    palo alto calif after year scorn polit process...
20801    russian warship readi strike terrorist near al...
20802    video nodapl nativ american leader vow stay al...
20803    if first succeed tri differ sport tim tebow he...
20804    min ago view comment like for first time histo...
                               ...                        
25995    of dysfunct plagu world megac none may pernici...
25996    washington gov john kasich ohio tuesday sign l...
25997    good morn want get california today email here...
25998    previou next us marin to be deploy to russian ...
25999    perhap seen new tv seri whose pilot episod beg...
Name: text, Length: 5200, dtype: object

In [127]:
X_test = vectorizer.transform(X_test)
print(X_test)

  (0, 109370)	0.012380635321957812
  (0, 109330)	0.013114672950106489
  (0, 109114)	0.013521733836247989
  (0, 108977)	0.008295579624412498
  (0, 108147)	0.014737762078934642
  (0, 108020)	0.03288519799030707
  (0, 107983)	0.01807352442760504
  (0, 107971)	0.024000856767260423
  (0, 107952)	0.010948409318615142
  (0, 107942)	0.028610002890245546
  (0, 107923)	0.017928592830546475
  (0, 107915)	0.00983230935238605
  (0, 107505)	0.015058058142278894
  (0, 107077)	0.018145979413082106
  (0, 107050)	0.027938892441689563
  (0, 106940)	0.015284557197738512
  (0, 106741)	0.01296099636109381
  (0, 106699)	0.019191794720526543
  (0, 106466)	0.04458842578142207
  (0, 106378)	0.010898435272784068
  (0, 106217)	0.02236802455368037
  (0, 106180)	0.01780247774831192
  (0, 106095)	0.029530478984735826
  (0, 106017)	0.010248543574604892
  (0, 105869)	0.026735779093853412
  :	:
  (5199, 3946)	0.04426126829468128
  (5199, 3761)	0.02066975146430898
  (5199, 3610)	0.030939379482725994
  (5199, 3199)	0.052

In [132]:
submit_df = get_data('submit')
y_test = submit_df['label']

In [134]:
y_test

id
20800    0
20801    1
20802    0
20803    1
20804    1
        ..
25995    0
25996    1
25997    0
25998    1
25999    0
Name: label, Length: 5200, dtype: int64

In [137]:
y_pred = model.predict(X_test)
for i in y_pred:
    print(i)

0
1
1
0
1
1
0
1
1
1
1
1
1
1
1
0
0
0
1
0
1
1
1
1
1
0
1
0
0
1
0
1
0
0
1
0
0
1
1
0
0
0
1
0
0
1
0
0
1
1
0
1
0
0
1
0
1
0
0
0
1
1
1
0
1
1
1
1
1
0
1
1
1
0
0
0
0
0
1
0
0
1
0
1
1
1
1
0
0
0
1
0
0
1
1
0
0
0
1
0
0
1
0
0
0
0
1
0
1
1
1
1
0
0
0
0
0
1
0
1
0
0
1
0
0
1
1
0
1
0
1
0
1
0
0
0
0
1
1
1
1
0
1
1
1
0
1
0
0
0
1
0
0
1
0
1
0
1
1
1
1
1
1
1
0
1
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
1
1
0
1
0
1
0
0
0
1
1
1
0
1
0
0
1
0
1
1
1
1
1
0
1
0
1
1
0
1
1
1
0
0
1
0
1
0
0
0
1
1
1
0
0
0
0
1
1
1
1
1
1
1
0
1
1
0
1
1
0
1
1
0
1
1
1
1
0
1
0
0
1
1
1
0
1
1
0
1
1
1
1
0
1
0
1
1
1
0
0
0
0
0
1
1
1
0
1
1
0
0
1
0
0
0
1
0
1
1
1
1
1
1
0
0
0
1
1
1
0
1
1
0
0
0
1
1
0
0
1
0
1
0
1
0
0
0
1
0
1
1
0
0
1
1
0
1
1
0
1
1
0
0
0
1
0
1
0
1
0
1
1
0
1
1
0
1
1
0
1
0
1
0
0
1
1
1
1
1
0
1
1
0
0
0
1
0
0
1
0
1
1
0
0
0
1
1
0
1
1
1
1
0
0
1
1
1
1
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
1
0
1
1
0
1
0
0
0
1
0
1
0
0
0
1
1
0
1
0
1
0
0
1
0
1
0
0
1
1
1
1
0
1
0
0
1
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
1
1
0
1
1
0
1
0
0
0
1
1
1
0
0
0
0
1
0
0
0
1
1
1


1
1
1
0
1
0
0
0
0
1
0
1
0
0
1
1
0
1
0
1
0
1
1
1
0
1
0
0
1
0
1
0
1
1
1
1
0
0
1
1
0
1
0
0
1
1
1
1
0
1
1
1
0
0
0
1
1
0
0
1
1
1
1
1
0
0
1
1
0
1
1
1
1
0
1
1
1
1
0
0
1
0
0
1
1
0
1
0
1
0
1
0
0
0
0
0
1
0
1
1
1
0
1
0
0
1
0
1
1
0
1
1
0
1
0
1
0
0
0
0
1
0
1
1
0
1
1
0
0
1
1
0
1
1
1
0
1
1
0
1
1
1
0
1
0
1
0
1
1
0
0
1
0
0
0
1
0
0
0
1
0
1
1
0
0
1
0
1
0
0
0
0
1
1
0
1
1
0
1
1
0
0
0
0
1
0
1
1
1
1
1
1
0
0
1
1
1
1
1
1
0
0
0
1
1
1
0
1
0
1
0
0
0
1
0
0
0
1
0
1
1
0
1
0
0
0
1
1
0
0
1
0
0
0
0
0
1
1
0
0
0
0
1
0
0
1
1
0
0
1
1
1
1
0
0
0
0
0
1
1
0
1
0
1
1
0
1
1
0
0
1
0
1
1
1
1
1
0
1
1
0
1
1
0
1
0
0
0
1
1
1
0
0
0
1
1
1
0
1
0
0
1
0
1
1
0
0
1
0
0
0
1
0
0
1
1
1
1
0
1
1
1
0
0
0
1
1
1
1
0
0
1
1
1
0
0
1
0
0
0
0
0
0
0
1
0
1
1
0
0
0
1
0
1
0
1
0
1
0
0
0
1
0
1
1
0
1
0
0
0
0
1
1
0
1
1
0
0
1
1
0
1
1
1
1
1
1
0
1
1
1
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
1
1
1
1
0
0
1
1
0
0
1
0
0
0
1
0
0
0
1
0
1
0
0
1
1
0
1
0
1
0
1
1
1
0
0
0
1
0
0
0
0
0
1
0
1
1
0
1
1
1
1
0
1
1
0
1
1
1
0
1
0
0
1
0
0
0
1
0
0
1
1
0
1
1
1
1
1
1
1
1
1
0
1
0
1
0
1
0
0
1
0
1
1


In [136]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.65      0.62      2339
           1       0.69      0.63      0.66      2861

    accuracy                           0.64      5200
   macro avg       0.64      0.64      0.64      5200
weighted avg       0.64      0.64      0.64      5200



In [149]:
id = pd.Series(test_df.index, name='id')
label_pred = pd.Series(y_pred, name='label')

sub_df = pd.concat([id, label_pred], axis='columns').set_index('id')
sub_df.to_csv('./dataset/submission_fake_news.csv')