# Import Libraries

In [110]:
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup           
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import collections
import itertools
import pandas as pd
import re
import math
import numpy as np

# Load CSV

In [111]:
# Read csv
data = pd.read_csv('./datasets/Combined_News_DJIA.csv')

In [112]:
# Display data
data.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [113]:
# Display dataframe shape
data.shape
# 27 columns: 1 target, 26 features
# 1989 observations

(1989, 27)

# Split to X and y

In [116]:
# Split daata into features X and target y
y = data[['Label']]
X = data.loc[:,data.columns!= 'Label']

In [6]:
# Display top 5 rows of X
X.head()

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",b'Georgian troops retreat from S. Osettain cap...,...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,b'Welcome To World War IV! Now In High Definit...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...","b""The US military was surprised by the timing ...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',"b""The commander of a Navy air reconnaissance s...",...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",b'Russia exaggerating South Ossetian death tol...,...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [117]:
# Display top 5 rows of y
y.head()

Unnamed: 0,Label
0,0
1,1
2,0
3,0
4,1


In [8]:
# Display X dataframe shape
X.shape

(1989, 26)

In [9]:
# Display y dataframe shape
y.shape

(1989,)

# Macro Analysis

In [10]:
# check for nulls
data.isnull().sum()
# Only 3 features with missing rows

Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
dtype: int64

In [11]:
# See missing data in Top23
X[X['Top23'].isnull()]

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
277,2009-09-15,b'The Church of Scientology won\'t be dissolve...,b'New virus from rats can kill 80 per cent of ...,b'The gruesome spectacle of dolphins being sla...,b'The End of Innocence in Afghanistan: \'The G...,b'France approves Internet piracy bill',b'The Rural Doctors Association says right now...,b'Al Jazeera English - Africa - Shabab to aven...,"b""How Sri Lanka governs through detentions - S...",b'Two months after the Pakistani Army wrested ...,...,b'In an equine echo of the controversy surroun...,b'UPDATE: 5-New York homes raided in terrorism...,b'Population Growth Impeding Progress on the M...,b'Global Population to Reach 7 Billion by 2011',b'Government Funded Feminist Porn ',b'Can someone enlighten me re:Holy Land disput...,b'Human Rights Watch official suspended for co...,,,


In [12]:
# See missing data in Top24
X[X['Top24'].isnull()]

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
277,2009-09-15,b'The Church of Scientology won\'t be dissolve...,b'New virus from rats can kill 80 per cent of ...,b'The gruesome spectacle of dolphins being sla...,b'The End of Innocence in Afghanistan: \'The G...,b'France approves Internet piracy bill',b'The Rural Doctors Association says right now...,b'Al Jazeera English - Africa - Shabab to aven...,"b""How Sri Lanka governs through detentions - S...",b'Two months after the Pakistani Army wrested ...,...,b'In an equine echo of the controversy surroun...,b'UPDATE: 5-New York homes raided in terrorism...,b'Population Growth Impeding Progress on the M...,b'Global Population to Reach 7 Billion by 2011',b'Government Funded Feminist Porn ',b'Can someone enlighten me re:Holy Land disput...,b'Human Rights Watch official suspended for co...,,,
348,2009-12-24,b'Woman knocks down Pope Benedict at Christmas...,b'Ugandan President Museveni says he will bloc...,"b""Venezuela's Chavez threatens to kick out car...",b'Woman who knocked down pope had pasta in her...,"b'450 people from 21 countries, including 30 f...",b'Chvez declares Angel Falls is no more: World...,b'Drug tests catch out 10 police (UK)',"b'Once the worlds fourth-largest lake, the Ara...","b""Cool Saudi Feminist calls for woman's right ...",...,b'Pig farts spark Australia gas scare - I shit...,b'Bin Laden daughter flees to Saudi embassy in...,b'Traumatic Brain Injuries: Growing evidence ...,b'The Pakistan Supreme Court has ordered the P...,"b""Here's a video the Taliban released this mor...",b'Fireworks set off aboard airliner',"b'Five VA men may face terrorism charges, Paki...","b""Ayatollah Montazeri's Legacy: In death he m...",,
681,2011-04-21,"Director of ""Restrepo"" and Photographer Chris ...",Everyone within 20km of Fukushima will be forc...,Bahrain: Activist Zaynab Al-Khawaja's letter t...,"More than 80,000 people are claiming incapacit...",Libya: 'mission creep' claims as UK sends in m...,Japan considers banning access to evacuation zone,Chinese oil giant Sinopec has stopped exportin...,6.0-magnitude earthquake strikes off east coas...,Next war in line: At least 20 South Sudanese a...,...,Russian migration official fired in racism row...,Small amounts of radioactive iodine found in b...,"War photographer, Oscar-nominated film directo...",Effeminate boys are being sent to a special ca...,Japan Government Declares 12-Mile Area Around ...,TEPCO admitted Wednesday that nuclear fuel in ...,Fukushima evacuees face arrest if they return ...,Prince Charles wins some kind of a record,,


In [13]:
# See missing data in Top25
X[X['Top25'].isnull()]

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
277,2009-09-15,b'The Church of Scientology won\'t be dissolve...,b'New virus from rats can kill 80 per cent of ...,b'The gruesome spectacle of dolphins being sla...,b'The End of Innocence in Afghanistan: \'The G...,b'France approves Internet piracy bill',b'The Rural Doctors Association says right now...,b'Al Jazeera English - Africa - Shabab to aven...,"b""How Sri Lanka governs through detentions - S...",b'Two months after the Pakistani Army wrested ...,...,b'In an equine echo of the controversy surroun...,b'UPDATE: 5-New York homes raided in terrorism...,b'Population Growth Impeding Progress on the M...,b'Global Population to Reach 7 Billion by 2011',b'Government Funded Feminist Porn ',b'Can someone enlighten me re:Holy Land disput...,b'Human Rights Watch official suspended for co...,,,
348,2009-12-24,b'Woman knocks down Pope Benedict at Christmas...,b'Ugandan President Museveni says he will bloc...,"b""Venezuela's Chavez threatens to kick out car...",b'Woman who knocked down pope had pasta in her...,"b'450 people from 21 countries, including 30 f...",b'Chvez declares Angel Falls is no more: World...,b'Drug tests catch out 10 police (UK)',"b'Once the worlds fourth-largest lake, the Ara...","b""Cool Saudi Feminist calls for woman's right ...",...,b'Pig farts spark Australia gas scare - I shit...,b'Bin Laden daughter flees to Saudi embassy in...,b'Traumatic Brain Injuries: Growing evidence ...,b'The Pakistan Supreme Court has ordered the P...,"b""Here's a video the Taliban released this mor...",b'Fireworks set off aboard airliner',"b'Five VA men may face terrorism charges, Paki...","b""Ayatollah Montazeri's Legacy: In death he m...",,
681,2011-04-21,"Director of ""Restrepo"" and Photographer Chris ...",Everyone within 20km of Fukushima will be forc...,Bahrain: Activist Zaynab Al-Khawaja's letter t...,"More than 80,000 people are claiming incapacit...",Libya: 'mission creep' claims as UK sends in m...,Japan considers banning access to evacuation zone,Chinese oil giant Sinopec has stopped exportin...,6.0-magnitude earthquake strikes off east coas...,Next war in line: At least 20 South Sudanese a...,...,Russian migration official fired in racism row...,Small amounts of radioactive iodine found in b...,"War photographer, Oscar-nominated film directo...",Effeminate boys are being sent to a special ca...,Japan Government Declares 12-Mile Area Around ...,TEPCO admitted Wednesday that nuclear fuel in ...,Fukushima evacuees face arrest if they return ...,Prince Charles wins some kind of a record,,


In [14]:
# Convert NaN to blanks
X['Top23'].fillna('',inplace=True)
X['Top24'].fillna('',inplace=True)
X['Top25'].fillna('',inplace=True)
# check for nulls again
X.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Date     0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    0
Top24    0
Top25    0
dtype: int64

In [15]:
# Check datatype for X
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989 entries, 0 to 1988
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    1989 non-null   object
 1   Top1    1989 non-null   object
 2   Top2    1989 non-null   object
 3   Top3    1989 non-null   object
 4   Top4    1989 non-null   object
 5   Top5    1989 non-null   object
 6   Top6    1989 non-null   object
 7   Top7    1989 non-null   object
 8   Top8    1989 non-null   object
 9   Top9    1989 non-null   object
 10  Top10   1989 non-null   object
 11  Top11   1989 non-null   object
 12  Top12   1989 non-null   object
 13  Top13   1989 non-null   object
 14  Top14   1989 non-null   object
 15  Top15   1989 non-null   object
 16  Top16   1989 non-null   object
 17  Top17   1989 non-null   object
 18  Top18   1989 non-null   object
 19  Top19   1989 non-null   object
 20  Top20   1989 non-null   object
 21  Top21   1989 non-null   object
 22  Top22   1989 non-null   

In [16]:
# Display first date and last date of data
print("First date: "+ X['Date'].min())
print("Last date: "+ X['Date'].max())

First date: 2008-08-08
Last date: 2016-07-01


In [17]:
# Check date
X['Date'].head(15)

0     2008-08-08
1     2008-08-11
2     2008-08-12
3     2008-08-13
4     2008-08-14
5     2008-08-15
6     2008-08-18
7     2008-08-19
8     2008-08-20
9     2008-08-21
10    2008-08-22
11    2008-08-25
12    2008-08-26
13    2008-08-27
14    2008-08-28
Name: Date, dtype: object

Notice that the observations data is not everyday. 8 August 2008 falls on a Friday. 
11 to 15 August 2008 falls on weekday. Same for 18 to 22 August 2008. This means that there is no stock trading on weekends. The New York Stock Exchange (NYSE)<sup>1</sup> is also closed on public holidays and special occassions such as mourning subject to announcement.

In [18]:
X.columns

Index(['Date', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8',
       'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16',
       'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24',
       'Top25'],
      dtype='object')

In [19]:
# Create a combine feature
cols = X.columns[X.columns != 'Date']
temp_df = X[cols].apply('-'.join)

In [66]:
temp_df

Unnamed: 0,test
0,"b""Georgia 'downs two Russian warplanes' as cou..."
1,b'Why wont America and Nato help us? If they w...
2,b'Remember that adorable 9-year-old who sang a...
3,b' U.S. refuses Israel weapons to attack Iran:...
4,b'All the experts admit that we should legalis...
...,...
1984,Barclays and RBS shares suspended from trading...
1985,"2,500 Scientists To Australia: If You Want To ..."
1986,Explosion At Airport In Istanbul Yemeni former...
1987,Jamaica proposes marijuana dispensers for tour...


In [67]:
temp_df=pd.DataFrame()
temp_df['text'] = X[cols].apply(lambda x: ' '.join(x), axis = 1) 

In [52]:
X['Top1'][0]

'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"'

In [53]:
X['Top2'][0]

"b'BREAKING: Musharraf to be impeached.'"

In [54]:
X['Top3'][0]

"b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'"

In [68]:
temp_df['text'][0]

'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\'

temp

In [22]:
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup           
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import collections
import itertools
import pandas as pd
import re
import math
import numpy as np

In [29]:
def posts_to_words(subreddit):
    # Function to convert a raw post to a string of words
    # The input is a single string (a raw post of a subreddit), and 
    # the output is a single string (a preprocessed post)
    
    # 1. Remove HTML, if any
    post_text = BeautifulSoup(subreddit).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", post_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. Remove stopwords.
    meaningful_words = [w for w in words if w not in STOPWORDS]
    
    # 5. Lemmatize words
    # Chose lemmatize over stemming because it's less crude
    # Instantiate lemmatizer. (
    lemmatizer = WordNetLemmatizer()
    short_meaningful_words = [lemmatizer.lemmatize(words) for words in meaningful_words]

    # 6. Join the words back into one string separated by space, 
    # and return the result in order to put into countvectorizer later
    return(" ".join(short_meaningful_words))

In [24]:
X.head()

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",b'Georgian troops retreat from S. Osettain cap...,...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,b'Welcome To World War IV! Now In High Definit...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...","b""The US military was surprised by the timing ...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',"b""The commander of a Navy air reconnaissance s...",...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",b'Russia exaggerating South Ossetian death tol...,...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [25]:
temp_X = X[['Date','Top1']]

In [26]:
temp_X.head()

Unnamed: 0,Date,Top1
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-11,b'Why wont America and Nato help us? If they w...
2,2008-08-12,b'Remember that adorable 9-year-old who sang a...
3,2008-08-13,b' U.S. refuses Israel weapons to attack Iran:...
4,2008-08-14,b'All the experts admit that we should legalis...


In [69]:
clean_train_posts = []

for posts in tqdm(temp_df['text']):
    # Convert posts to words, then append to clean_train_posts.
    clean_train_posts.append(posts_to_words(posts))

HBox(children=(FloatProgress(value=0.0, max=1989.0), HTML(value='')))




In [70]:
clean_train_posts

['b georgia down two russian warplane country move brink war b breaking musharraf impeached b russia today column troop roll south ossetia footage fighting youtube b russian tank moving towards capital south ossetia reportedly completely destroyed georgian artillery fire b afghan child raped impunity u n official say sick three year old raped nothing b russian tank entered south ossetia whilst georgia shoot two russian jet b breaking georgia invades south ossetia russia warned intervene s side b enemy combatent trial nothing sham salim haman sentenced year will kept longer anyway feel b georgian troop retreat s osettain capital presumably leaving several hundred people killed video b u s prep georgia war russia b rice give green light israel attack iran say u s veto israeli military ops b announcing class action lawsuit behalf american public fbi b russia georgia war nyt s top story opening ceremony olympics fucking disgrace yet proof decline journalism b china tell bush stay country a

In [71]:
# Create a list of lists containing lowercase words for each posts
# do a simple preprocesing split and lowercase the words using .lower and .split
words_in_casualconv = [post.lower().split() for post in clean_train_posts]
# Flatten your list, so that all words across the posts are in one list
# List of all words across posts
all_words_casualconv = list(itertools.chain(*words_in_casualconv))

# To get the count of how many times each word appears in the sample
# I can use the built-in Python library 'collections'
counts_casualconv = collections.Counter(all_words_casualconv)

# The 'collection.Counter' object has 'most_common' that will return the most commonly used words and the number of times that they are used.
print("Most common 150 words for CasualConversation:\n")
print(counts_casualconv.most_common(150))

Most common 150 words for CasualConversation:

[('b', 12036), ('s', 10535), ('u', 4945), ('say', 3416), ('year', 3149), ('world', 2462), ('new', 2426), ('will', 2331), ('government', 2228), ('china', 2223), ('israel', 2214), ('police', 1966), ('people', 1928), ('country', 1863), ('russia', 1767), ('state', 1740), ('war', 1652), ('woman', 1490), ('israeli', 1486), ('one', 1470), ('attack', 1461), ('president', 1453), ('uk', 1443), ('right', 1419), ('iran', 1395), ('first', 1339), ('russian', 1330), ('killed', 1330), ('north', 1272), ('child', 1258), ('korea', 1236), ('two', 1193), ('said', 1187), ('law', 1187), ('death', 1185), ('report', 1178), ('time', 1158), ('million', 1155), ('t', 1152), ('now', 1151), ('minister', 1137), ('military', 1135), ('protest', 1126), ('court', 1092), ('official', 1091), ('un', 1024), ('bank', 1014), ('palestinian', 1013), ('force', 1012), ('news', 1012), ('nuclear', 1001), ('found', 988), ('man', 977), ('drug', 977), ('syria', 975), ('chinese', 968), ('so

In [72]:
# Instantiate our CountVectorizer.
cvec = CountVectorizer(max_features=500)

In [73]:
X_train_cvec = pd.DataFrame(cvec.fit_transform(clean_train_posts).todense(),columns=cvec.get_feature_names()) 

In [74]:
X_train_cvec.head()

Unnamed: 0,abuse,access,according,accused,across,act,action,activist,afghan,afghanistan,...,will,win,without,woman,work,worker,world,year,young,zealand
0,0,0,0,0,0,1,1,0,1,0,...,3,0,0,0,1,0,2,2,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,2,0,0
3,0,0,0,0,0,0,1,0,0,0,...,2,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,2,1,0,0


In [37]:
y.value_counts(normalize=True)

1    0.535445
0    0.464555
Name: Label, dtype: float64

In [75]:
def BestModel(X,y):
    # empty list to store results
    score_list = []
    model_list = []
    test_size_list = []
    vectorizer_list = []
    
    # Set 3 different test size for train_test_split
    x = [0.3,0.25,0.20]
    
    # models to try
    models = [
        MultinomialNB(),
        LogisticRegression(),
        RandomForestClassifier(max_depth=None, n_estimators= 300), #from gridsearchcv result earlier
        ExtraTreesClassifier()
    ]
    
    # vectorizer to try
    vectorizer = [
        CountVectorizer(max_features=500),
        TfidfVectorizer(max_features=500)
    ]
    
    for test_size in x:
        
        # train test split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,test_size=test_size,random_state=42) 
    
         # Initialize an empty list to hold the clean posts.
        clean_train_posts = []
        clean_test_posts = []

        # For every post in our training set
        for posts in X_train['text']:
            # Convert posts to words, then append to clean_train_posts.
            clean_train_posts.append(posts_to_words(posts))
    
        # For every post in our testing set
        for posts in X_test['text']:

        # Convert posts to words, then append to clean_test_posts.
            clean_test_posts.append(posts_to_words(posts))
     
        # Instantiate Vectorizer.
        for vectorize in vectorizer:
            
            # Fit and transform Vectorizer on the training data 
            # .todense make it to array from sparse matrix
            X_train_cvec = pd.DataFrame(vectorize.fit_transform(clean_train_posts).todense(),columns=vectorize.get_feature_names()) 
       
            # Transform our testing data with the already-fit Vectorizer.
            X_test_cvec = pd.DataFrame(vectorize.transform(clean_test_posts).todense(), columns=vectorize.get_feature_names())
        
            
            # Fit and score the models
            for model in models:
                model.fit(X_train_cvec, y_train)
                score = model.score(X_test_cvec, y_test)
                # store results
                model_list.append(model)
                score_list.append(score)
                test_size_list.append(test_size)
                vectorizer_list.append(vectorize)
    
    # combine results into dataframe
    df = pd.DataFrame(list(zip(model_list, score_list,test_size_list,vectorizer_list)), 
               columns =['Model', 'Score','Test Size','Vectorizer']) 
    return (df)

In [80]:
X=temp_df

In [81]:
X.head()

Unnamed: 0,text
0,"b""Georgia 'downs two Russian warplanes' as cou..."
1,b'Why wont America and Nato help us? If they w...
2,b'Remember that adorable 9-year-old who sang a...
3,b' U.S. refuses Israel weapons to attack Iran:...
4,b'All the experts admit that we should legalis...


In [82]:
df = BestModel(X,y)
df.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Model,Score,Test Size,Vectorizer
0,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.495812,0.3,"CountVectorizer(analyzer='word', binary=False,..."
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.494137,0.3,"CountVectorizer(analyzer='word', binary=False,..."
2,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.517588,0.3,"CountVectorizer(analyzer='word', binary=False,..."
3,"(ExtraTreeClassifier(ccp_alpha=0.0, class_weig...",0.519263,0.3,"CountVectorizer(analyzer='word', binary=False,..."
4,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.517588,0.3,"TfidfVectorizer(analyzer='word', binary=False,..."


In [83]:
# using test size of 0.2 decided earlier from the BestModel function
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,test_size=0.2,random_state=42) 
    

In [85]:
# Initialize an empty list to hold the clean posts.
clean_train_posts = []
clean_test_posts = []

print("Cleaning and parsing posts for training set:")
# For every post in our training set
for posts in tqdm(X_train['text']):
    # Convert posts to words, then append to clean_train_posts.
    clean_train_posts.append(posts_to_words(posts))
    
print("Cleaning and parsing posts for training set:")
# For every post in our testingcv set
for posts in tqdm(X_test['text']):

    # Convert posts to words, then append to clean_test_posts.
    clean_test_posts.append(posts_to_words(posts))
    
# Instantiate the transformer.
tvec = TfidfVectorizer()

# Fit and transform TfidfVectorizer on the training data 
# .todense make it to array from sparse matrix
X_train_tfid = pd.DataFrame(tvec.fit_transform(clean_train_posts).todense(),columns=tvec.get_feature_names()) 
       
# Transform our testing data with the already-fit TfidfVectorizer.
X_test_tfid = pd.DataFrame(tvec.transform(clean_test_posts).todense(), columns=tvec.get_feature_names())
        

Cleaning and parsing posts for training set:


HBox(children=(FloatProgress(value=0.0, max=1591.0), HTML(value='')))


Cleaning and parsing posts for training set:


HBox(children=(FloatProgress(value=0.0, max=398.0), HTML(value='')))




In [86]:
# Instantiate our model
logreg_final = LogisticRegression()

In [87]:
# Fit Logistic Regression model
logreg_final.fit(X_train_tfid,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [89]:
#Score our model on the testing set
# This is Accuracy score 
print(f"The best test score is: {round(logreg_final.score(X_test_tfid,y_test),3)*100} %")

The best test score is: 50.3 %


In [90]:
# keep the coefficients in dataframe
coefficients = pd.concat([pd.DataFrame(X_test_tfid.columns),pd.DataFrame(np.transpose(logreg_final.coef_))], axis = 1)
coefficients.columns=['word','coef']
coefficients['exp_coef'] = coefficients['coef'].map(lambda x: math.exp(x))
coefficients.head()

Unnamed: 0,word,coef,exp_coef
0,aa,0.04614,1.047221
1,aaa,0.04844,1.049632
2,aaaw,-0.046579,0.954489
3,aab,-0.043049,0.957864
4,aabo,0.035117,1.03574


In [99]:
y_pred = pd.DataFrame(logreg_final.predict(X_test_tfid))
y_pred.head()


Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0


In [109]:
y_pred[0].value_counts(normalize=True)

1    0.751256
0    0.248744
Name: 0, dtype: float64

In [104]:
result = pd.concat([y_pred,y],axis=1)
result.head()

Unnamed: 0,0,Label
0,1.0,0
1,1.0,1
2,1.0,0
3,1.0,0
4,0.0,1


In [91]:
coef_parenting = coefficients.sort_values(by='coef',ascending=False)[:100]
coef_parenting.head()

Unnamed: 0,word,coef,exp_coef
12230,jew,0.818486,2.267065
24237,uk,0.725231,2.065207
18949,rebel,0.621943,1.862543
19036,record,0.62032,1.859524
23328,territory,0.586332,1.797383


# Sources

<sup>1</sup> https://www.investors.com/research/stock-market-holidays/