# DS-6014 Bayesian Machine Learning Final Project
#### Haizhu Hong, Yiran Zheng

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.corpus import stopwords
import json 

### load the data

In [2]:
# Historical news headlines from the Reddit WorldNews channel (/r/worldnews).
# They are ranked by Reddit users' votes, and only the top 25 headlines are
# considered for a single date.
news=pd.read_csv("../data/RedditNews.csv")
news.head()

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [3]:
# Dow Jones Industrial Average (DJIA) price table, used to "prove the concept".
stock=pd.read_csv("../data/upload_DJIA_table.csv")
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [4]:
# Each row holds a date followed by that day's top 25 Reddit news headlines
# (columns Top1..Top25).
# Label is a binary class:
#   "1" when the DJIA Adj Close value rose or stayed the same;
#   "0" when the DJIA Adj Close value decreased.

combined=pd.read_csv("../data/Combined_News_DJIA.csv")
combined.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


### data cleaning and preprocessing

### First check if there is any Null value in the data

In [5]:
# True when at least one cell of `news` is missing
news.isna().to_numpy().any()

False

In [6]:
# True when at least one cell of `stock` is missing
stock.isna().to_numpy().any()

False

In [7]:
# True when at least one cell of `combined` is missing
combined.isna().to_numpy().any()

True

In [8]:
# Locate the missing values: count the nulls in every column of `combined`
combined.isna().sum(axis=0)

Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
dtype: int64

In [9]:
# Fill the missing headlines with " " (only Top23, Top24 and Top25 contain nulls).
# The result is assigned back instead of calling fillna(inplace=True) on
# combined[col]: that chained in-place form operates on a temporary selection,
# emits a FutureWarning on pandas 2.x and leaves `combined` untouched once
# Copy-on-Write is enabled.
null_cols = ['Top23', 'Top24', 'Top25']
combined[null_cols] = combined[null_cols].fillna(" ")

In [10]:
# English stop words from NLTK, kept in a set for fast membership tests
stopWords = set(stopwords.words('english'))


def preprocessor(text):
    """Tokenize `text` with NLTK and return it with the stop words removed.

    The remaining tokens are joined back together with single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if token in stopWords:
            continue
        kept.append(token)
    return " ".join(kept)

In [11]:
# Replace every non-letter character with a space and lower-case all the news
# headlines; the Date and Label columns are left untouched.
# The cleaned column is assigned back instead of calling replace(inplace=True)
# on combined[col]: that chained in-place form works on a temporary selection
# and stops updating `combined` once pandas Copy-on-Write is enabled.
# NOTE: the headlines are stored as b'...' literals, so a standalone "b" token
# survives at the start of each cleaned headline.
for col in combined.columns:
    if col in ('Date', 'Label'):
        continue
    letters_only = combined[col].replace("[^a-zA-Z]", " ", regex=True)
    combined[col] = letters_only.str.lower()

In [14]:
# Names of the headline columns (Top1, Top2, ...)
Tops = list(filter(lambda name: name.startswith("Top"), combined.columns))
# Strip the stop words from every headline, one column at a time
for Top in Tops:
    combined[Top] = combined[Top].map(preprocessor)
combined[Tops].head()

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,b georgia downs two russian warplanes countrie...,b breaking musharraf impeached,b russia today columns troops roll south osset...,b russian tanks moving towards capital south o...,b afghan children raped impunity u n official ...,b russian tanks entered south ossetia whilst g...,b breaking georgia invades south ossetia russi...,b enemy combatent trials nothing sham salim ha...,b georgian troops retreat osettain capital pre...,b u prep georgia war russia,...,b georgia invades south ossetia russia gets in...,b al qaeda faces islamist backlash,b condoleezza rice us would act prevent israel...,b busy day european union approved new sanctio...,b georgia withdraw soldiers iraq help fight ru...,b pentagon thinks attacking iran bad idea us n...,b caucasus crisis georgia invades south ossetia,b indian shoe manufactory series like work,b visitors suffering mental illnesses banned o...,b help mexico kidnapping surge
1,b wont america nato help us wont help us help ...,b bush puts foot georgian conflict,b jewish georgian minister thanks israeli trai...,b georgian army flees disarray russians advanc...,b olympic opening ceremony fireworks faked,b mossad fraudulent new zealand passports iraq,b russia angered israeli military sale georgia,b american citizen living ossetia blames u geo...,b welcome world war iv high definition,b georgia move mistake monumental proportions,...,b israel us behind georgian aggression,b believe tv neither russian georgian much vic...,b riots still going montreal canada police mur...,b china overtake us largest manufacturer,b war south ossetia pics,b israeli physicians group condemns state torture,b russia beaten united states head peak oil,b perhaps question georgia russia conflict,b russia much better war,b come trading sex food
2,b remember adorable year old sang opening cere...,b russia ends georgia operation,b sexual harassment would children,b al qa eda losing support iraq brutal crackdo...,b ceasefire georgia putin outmaneuvers west,b microsoft intel tried kill xo laptop,b stratfor russo georgian war balance power,b trying get sense whole georgia russia war vo...,b us military surprised timing swiftness russi...,b u beats war drum iran dumps dollar,...,b u troops still georgia know georgia first place,b russias response georgia right,b gorbachev accuses u making serious blunder p...,b russia georgia nato cold war two,b remember adorable year old led country war b...,b war georgia israeli connection,b signs point us encouraging georgia invade so...,b christopher king argues us nato behind georg...,b america new mexico,b bbc news asia pacific extinction man climate
3,b u refuses israel weapons attack iran report,b president ordered attack tskhinvali capital ...,b israel clears troops killed reuters cameraman,b britain policy tough drugs pointless says fo...,b body year old found trunk latest ransom paid...,b china moved million quake survivors prefab h...,b bush announces operation get russia grill ye...,b russian forces sink georgian ships,b commander navy air reconnaissance squadron p...,b cnn readers russia actions georgia justified,...,b elephants extinct,b us humanitarian missions soon georgia russia...,b georgia ddos came us sources,b russian convoy heads georgia violating truce,b israeli defence minister us strike iran,b gorbachev choice,b witness russian forces head towards tbilisi ...,b quarter russians blame u conflict poll,b georgian president says us military take con...,b nobel laureate aleksander solzhenitsyn accus...
4,b experts admit legalise drugs,b war south osetia pictures made russian soldier,b swedish wrestler ara abrahamian throws away ...,b russia exaggerated death toll south ossetia ...,b missile killed inside pakistan may launched cia,b rushdie condemns random house refusal publis...,b poland us agree missle defense deal interest...,b russians conquer tblisi bet seriously bet,b russia exaggerating south ossetian death tol...,b musharraf expected resign rather face impeac...,...,b bank analyst forecast georgian crisis days e...,b georgia confict could set back russia us rel...,b war caucasus much product american imperial ...,b non media photos south ossetia georgia conflict,b georgian tv reporter shot russian sniper liv...,b saudi arabia mother moves block child marriage,b taliban wages war humanitarian aid workers,b russia world forget georgia territorial inte...,b darfur rebels accuse sudan mounting major at...,b philippines peace advocate say muslims need ...


In [None]:
class LDA_trains:
    """Latent Dirichlet Allocation (LDA) topic model for Reddit news headlines.

    Input:
        headlines = iterable of headline strings used as the training corpus
        N_topics = number of topics for LDA to produce
        N_words = the number of top words to show for each topic
    Methods:
        Topics = fit the LDA model and return the top words of every topic
        Predict_Topics = return the estimated topic probabilities for new headlines
    """

    def __init__(self, headlines, N_topics=3, N_words=10):
        # the news headlines
        self.headlines = headlines
        # initialize variables
        self.N_topics = N_topics
        self.N_words = N_words
        # The LDA model is created and fitted by Topics()
        self.lda = None

        # Get the word counts in the headlines
        self.countVectorizer = CountVectorizer(stop_words='english')
        self.termFrequency = self.countVectorizer.fit_transform(self.headlines)
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on old releases that lack it.
        if hasattr(self.countVectorizer, "get_feature_names_out"):
            self.Words = list(self.countVectorizer.get_feature_names_out())
        else:
            self.Words = self.countVectorizer.get_feature_names()

    def Topics(self):
        """Fit the LDA model and return a dataframe of the top words per topic.

        Every call fits a fresh model on the training headlines. The result has
        one row per topic (index named "Topics") and columns "Word 0",
        "Word 1", ... ordered from the highest-weighted word downwards.
        """
        # Obtain the estimates for the LDA model
        self.lda = LatentDirichletAllocation(n_components=self.N_topics)
        self.lda.fit(self.termFrequency)

        # A topic cannot show more words than the vocabulary contains; capping
        # keeps the rows and the column labels the same length.
        n_words = min(self.N_words, len(self.Words))

        # For each topic keep the n_words highest-weighted words: argsort() is
        # ascending, so the reversed tail slice yields the largest weights first.
        topics = [
            [self.Words[i] for i in topic.argsort()[:-n_words - 1:-1]]
            for topic in self.lda.components_
        ]

        # Create column names for the output matrix
        cols = ["Word " + str(i) for i in range(n_words)]

        # Create a dataframe with the topic no. and the words in each topic
        Topics_df = pd.DataFrame(topics, columns=cols)
        Topics_df.index.name = "Topics"
        return Topics_df

    def Predict_Topics(self, new_headlines):
        """Return the estimated topic probabilities for new headlines.

        new_headlines is an iterable of headline strings (a single string is
        treated as one headline). The result has one row per headline: a
        'reports' column holding the headline text followed by one
        "Topic i" column per topic. If Topics() has not been called yet the
        model is fitted first.
        """
        # A bare string would otherwise be iterated character by character
        if isinstance(new_headlines, str):
            new_headlines = [new_headlines]
        self.new_headlines = new_headlines

        # Fit the model on demand so prediction does not fail on a missing model
        if getattr(self, "lda", None) is None:
            self.Topics()

        # Remove the stop words from the new headlines
        cleaned = [preprocessor(title) for title in self.new_headlines]

        if cleaned:
            # Estimate the topic probabilities of every new headline with the
            # model fitted on the training set; the result has shape
            # no. of new headlines X no. of topics
            new_headlines_topics = self.lda.transform(
                self.countVectorizer.transform(cleaned)
            )
        else:
            # Nothing to transform: return an empty table with the right columns
            new_headlines_topics = np.empty((0, self.N_topics))

        # Create column names for the output dataframe
        cols = ["Topic " + str(i) for i in range(self.N_topics)]

        # Create the dataframe whose rows contain topic probabilities for
        # the specified headlines. list() avoids index alignment if a pandas
        # Series was passed in.
        New_Reports_df = pd.DataFrame(new_headlines_topics, columns=cols)
        New_Reports_df.insert(0, 'reports', list(self.new_headlines))

        return New_Reports_df
                