In [138]:
import pandas as pd
import numpy as np

import nltk
import gensim
import json
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer

import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

df = pd.read_csv('news_data.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stop = stopwords.words('english')
stemmer = SnowballStemmer('english')

In [139]:
df.columns

Index(['id', 'page_id', 'name', 'message', 'description', 'caption',
       'post_type', 'status_type', 'likes_count', 'comments_count',
       'shares_count', 'love_count', 'wow_count', 'haha_count', 'sad_count',
       'thankful_count', 'angry_count', 'link', 'picture', 'posted_at',
       'source'],
      dtype='object')

In [140]:
df.head()

Unnamed: 0,id,page_id,name,message,description,caption,post_type,status_type,likes_count,comments_count,...,love_count,wow_count,haha_count,sad_count,thankful_count,angry_count,link,picture,posted_at,source
0,"﻿""86680728811_272953252761568""",86680728811,Chief Justice Roberts Responds to Judicial Eth...,Roberts took the unusual step of devoting the ...,PAUL J. RICHARDS/AFP/Getty Images Chief Justic...,abcnews.go.com,link,shared_story,61,27,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/headlines/2011/12/...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 00:30:26,abc
1,"﻿""86680728811_273859942672742""",86680728811,"With Reservations, Obama Signs Act to Allow De...",Do you agree with the new law?,"In his last official act of business in 2011, ...",abcnews.go.com,link,shared_story,120,523,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/politics/2011/12/w...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 01:08:58,abc
2,"﻿""86680728811_10150499874478812""",86680728811,Wishes For 2012 to Fall on Times Square,Some pretty cool confetti will rain down on Ne...,The wishes of thousands of people will flutter...,abcnews.go.com,link,published_story,271,31,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/headlines/2011/12/...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 02:00:37,abc
3,"﻿""86680728811_244555465618151""",86680728811,Mitt Romney Vows to Veto Dream Act if President,,"Eric Gay/AP Photo SIOUX CITY, Iowa – Mitt Romn...",abcnews.go.com,link,shared_story,140,188,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/politics/2011/12/m...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 02:35:20,abc
4,"﻿""86680728811_252342804833247""",86680728811,"NY Pharmacy Shootout Leaves Suspect, ATF Agent...",The pharmacy was held up by a man seeking pres...,A shootout at a suburban New York family pharm...,abcnews.go.com,link,shared_story,59,51,...,0,0,0,0,0,0,http://abcnews.go.com/US/ny-pharmacy-shootout-...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 03:36:01,abc


In [141]:
drop_columns = ['id', 'page_id', 'link', 'picture']
df.drop(drop_columns, axis=1,inplace=True)
df.shape

(286959, 17)

In [142]:
df.drop_duplicates(inplace=True)

In [143]:
text_data = ['name', 'message', 'description', 'caption']
quantitative = ['likes_count', 'comments_count', 'shares_count',
                'love_count','wow_count', 'haha_count', 'sad_count',
                'thankful_count', 'angry_count',]
descriptive_data = ['post_type', 'status_type', 'posted_at', 'source']

text_df = df.loc[:,text_data]
target_df = df.loc[:,quantitative]
quan_df = df.loc[:,descriptive_data]

### Text Processing

In [144]:
text_df

Unnamed: 0,name,message,description,caption
0,Chief Justice Roberts Responds to Judicial Eth...,Roberts took the unusual step of devoting the ...,PAUL J. RICHARDS/AFP/Getty Images Chief Justic...,abcnews.go.com
1,"With Reservations, Obama Signs Act to Allow De...",Do you agree with the new law?,"In his last official act of business in 2011, ...",abcnews.go.com
2,Wishes For 2012 to Fall on Times Square,Some pretty cool confetti will rain down on Ne...,The wishes of thousands of people will flutter...,abcnews.go.com
3,Mitt Romney Vows to Veto Dream Act if President,,"Eric Gay/AP Photo SIOUX CITY, Iowa – Mitt Romn...",abcnews.go.com
4,"NY Pharmacy Shootout Leaves Suspect, ATF Agent...",The pharmacy was held up by a man seeking pres...,A shootout at a suburban New York family pharm...,abcnews.go.com
...,...,...,...,...
286954,Donald Trump Says Tough Campaign Worth It Desp...,"“It took massive amounts of work, incredible a...","Donald Trump, in the last hours of his preside...",blogs.wsj.com
286955,Donald Trump Says Loss Would Spell ‘Single Gre...,"As he has many times, Donald J. Trump cast his...",Republican presidential nominee Donald Trump h...,wsj.com
286956,Donald Trump’s Loyal Numbers Man,"As the Trump Organization's finance chief, All...",Allen Weisselberg maintains a low-key profile ...,blogs.wsj.com
286957,HealthCare.gov Site Straining to Keep Up With ...,HealthCare.gov has been straining to handle th...,"Online “waiting rooms,” where people are sent ...",wsj.com


In [145]:
def stem(text):
    v_lemmed = [WordNetLemmatizer().lemmatize(word) for word in text if len(list(word))>4]
    return [stemmer.stem(word) for word in v_lemmed]

def lemmatization(texts):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

def clean_text(post_df, column):
    post_df[column] = post_df[column].str.replace(r"[^\w\s]", " ", regex=True)
    post_df[column].fillna("-", inplace=True)
    post_df[column] = post_df[column].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    post_df[column] = post_df[column].str.replace(r'http?.*.com?', ' ', regex=True)
    post_df[column] = post_df[column].str.split()
    post_df[column] = lemmatization(post_df[column])
    post_df[column] = post_df[column].apply(stem)

In [146]:
for col in text_df.columns:
    print(f"The {col} is being processed.")
    clean_text(text_df,col)

The name is being processed.


KeyboardInterrupt: 

In [None]:
text_df.head()

Unnamed: 0,name,message,description,caption
0,"[chief, justic, robert, respond, judici, ethic...","[robert, unusu, devot, major, annual, report, ...","[richard, getti, imag, chief, justic, robert, ...",[abcnew]
1,"[reserv, obama, sign, allow, detent, citizen]",[agre],"[offici, busi, presid, barack, obama, nation, ...",[abcnew]
2,"[time, squar]","[pretti, confetti, celebr]","[thousand, peopl, flutter, build, descend, tim...",[abcnew]
3,"[romney, dream, presid]",[],"[photo, sioux, romney, explicit, state, today,...",[abcnew]
4,"[pharmaci, shootout, leav, suspect, agent]","[pharmaci, prescript, medic]","[shootout, suburban, famili, pharmaci, leav, s...",[abcnew]


In [None]:
text_df.to_csv('./processed_text.csv', encoding='utf-8')

In [None]:
num_rows = text_df.shape[0]

In [None]:
def get_freq(col):
    word_freq = {}
    for i in range(num_rows):
        dist = nltk.FreqDist(text_df[col].iloc[i])
        for key in dist.keys():
            if key in word_freq.keys():
                word_freq[key] += dist[key]
            else:
                word_freq[key] = dist[key]
    return word_freq

col_word_freq = {}
for col in text_df.columns:
    print(f'The word Frequencies are being computed for the column:{col}')
    col_word_freq[col] = get_freq(col)

The word Frequencies are being computed for the column:name
The word Frequencies are being computed for the column:message
The word Frequencies are being computed for the column:description
The word Frequencies are being computed for the column:caption


In [None]:
import json

with open('word_freq.json', 'w') as f:
    json.dump(col_word_freq,f)

In [None]:
col_word_freq

{'name': {'chief': 912,
  'justic': 623,
  'robert': 316,
  'respond': 471,
  'judici': 5,
  'ethic': 62,
  'critic': 854,
  'reserv': 119,
  'obama': 6175,
  'sign': 122,
  'allow': 505,
  'detent': 93,
  'citizen': 176,
  'time': 1666,
  'squar': 214,
  'romney': 746,
  'dream': 432,
  'presid': 2764,
  'pharmaci': 17,
  'shootout': 118,
  'leav': 1961,
  'suspect': 2397,
  'agent': 267,
  'world': 4287,
  'ring': 41,
  'break': 1375,
  'magnitud': 74,
  'quak': 314,
  'japan': 929,
  'tsunami': 99,
  'warn': 345,
  'buyer': 136,
  'collect': 206,
  'settlement': 192,
  'coast': 636,
  'guard': 426,
  'passeng': 491,
  'limit': 406,
  'reflect': 193,
  'american': 3124,
  'weight': 275,
  'deepli': 44,
  'concern': 488,
  'creat': 520,
  'mutat': 22,
  'bird': 56,
  'again': 509,
  'polic': 5132,
  'georgia': 291,
  'woman': 4144,
  'sever': 393,
  'parent': 1152,
  'arrest': 2112,
  'alleg': 1352,
  'post': 128,
  'photo': 26554,
  'bound': 86,
  'facebook': 1689,
  'person': 664,
 

In [None]:
vocab = []
for key in col_word_freq.keys():
    vocab.extend(list(set(col_word_freq[key].keys())))
vocab = set(vocab)

In [None]:
combined = []
for i in range(text_df.shape[0]):
    temp=[]
    for col in range(len(col_word_freq.keys())):
        temp.extend(text_df.iloc[i,col])
    combined.append(list(set(temp)))

text_df['Unique_tokens'] = combined
l = []
for x in text_df.Unique_tokens:
    l.extend(x)

freq = dict(nltk.FreqDist(l))

sorted_freq = dict(sorted(freq.items(), key=lambda x: x[1], reverse=True))
filtered = {}
# for k,v in freq:
#     if (v>15) and (v<400):
#         print(freq)
temp = np.array(list(sorted_freq.values()))

In [None]:
sorted_freq

{'nbcnew': 39851,
 'nytim': 35078,
 'photo': 30264,
 'latim': 25675,
 'timelin': 22331,
 'peopl': 22087,
 'state': 18939,
 'abcnew': 18291,
 'first': 17716,
 'world': 16791,
 'presid': 16434,
 'polic': 15756,
 'trump': 15424,
 'american': 14014,
 'could': 13698,
 'report': 13688,
 'cbsnew': 13592,
 'would': 13348,
 'donald': 13267,
 'woman': 13004,
 'nation': 12562,
 'offici': 12534,
 'obama': 12088,
 'time': 11999,
 'watch': 11578,
 'shoot': 11320,
 'attack': 10991,
 'today': 10758,
 'video': 10609,
 'think': 10457,
 'countri': 10005,
 'after': 9924,
 'clinton': 9909,
 'leav': 9848,
 'offic': 9664,
 'million': 9612,
 'death': 9473,
 'famili': 9367,
 'stori': 9061,
 'break': 8658,
 'becom': 8578,
 'month': 8568,
 'hous': 8512,
 'hillari': 8385,
 'child': 8242,
 'three': 8202,
 'republican': 8014,
 'chang': 7648,
 'former': 7549,
 'right': 7527,
 'question': 7504,
 'night': 7434,
 'school': 7322,
 'campaign': 7196,
 'presidenti': 7188,
 'still': 7188,
 'includ': 7177,
 'polit': 7106,
 '

In [None]:
quantiles = [0.999,0.99, 0.98, 0.975, 0.95, 0.8, 0.7, 0.6, 0.5 ]
for qunatile in quantiles:
    print(f'Quantile: {qunatile*100} \t Freq: {np.quantile(temp, qunatile)}')

print('---------------------')

Quantile: 99.9 	 Freq: 4485.60400000005
Quantile: 99.0 	 Freq: 618.0
Quantile: 98.0 	 Freq: 236.0
Quantile: 97.5 	 Freq: 167.0
Quantile: 95.0 	 Freq: 51.0
Quantile: 80.0 	 Freq: 3.0
Quantile: 70.0 	 Freq: 1.0
Quantile: 60.0 	 Freq: 1.0
Quantile: 50.0 	 Freq: 1.0
---------------------


The above table shows us the quantile vs word_freq. This helps us remove all the repeating unnecessary words and lets us only extract the words that are useful for us. Here our foal is to only extract the words that have some effects. It should be intuitively understood that words occuring only once in the entire text corpus either have a very unique effect or they don't effect the topic that much. 

Here we are going with the latter case, as the words found tend to mostly be gibberish extracted from the broken links etc.,

### !! Try and include the above table for the report!!

In [None]:
from multiprocessing.sharedctypes import Value

filtered = {}

for key in  sorted_freq.keys():
    if  (sorted_freq[key]>1) and (sorted_freq[key]<=236):
        filtered[key] = sorted_freq[key]

final_list = list(filtered.keys())

filtered_tokens = []
print('Token filtering started.')

i=0
for row in text_df.Unique_tokens:
    i+=1
    temp = []
    for word in row:
        if word in final_list:
            temp.append(word)
    filtered_tokens.append(temp)
    if i%1000 ==0:
        print(f'{i} Rows are processed...')

for i in range(10):
    print(f'The filtered_tokens\t :{filtered_tokens[i]}')
    print(f'The Original\t\t :{text_df.Unique_tokens[i]} \n')

text_df['Unique_tokens'] = filtered_tokens

Token filtering started.
1000 Rows are processed...
2000 Rows are processed...
3000 Rows are processed...
4000 Rows are processed...
5000 Rows are processed...
6000 Rows are processed...
7000 Rows are processed...
8000 Rows are processed...
9000 Rows are processed...
10000 Rows are processed...
11000 Rows are processed...
12000 Rows are processed...
13000 Rows are processed...
14000 Rows are processed...
15000 Rows are processed...
16000 Rows are processed...
17000 Rows are processed...
18000 Rows are processed...
19000 Rows are processed...
20000 Rows are processed...
21000 Rows are processed...
22000 Rows are processed...
23000 Rows are processed...
24000 Rows are processed...
25000 Rows are processed...
26000 Rows are processed...
27000 Rows are processed...
28000 Rows are processed...
29000 Rows are processed...
30000 Rows are processed...
31000 Rows are processed...
32000 Rows are processed...
33000 Rows are processed...
34000 Rows are processed...
35000 Rows are processed...
3600

In [None]:
text_df['Unique_tokens'] = filtered_tokens
texts = list(text_df.Unique_tokens)
id2word = gensim.corpora.Dictionary(text_df.Unique_tokens)
corpus = [id2word.doc2bow(doc) for doc in text_df.Unique_tokens]

In [None]:
for i in range(50):
    print(f'{list(sorted_freq.keys())[i]} : {list(sorted_freq.values())[i]}')

nbcnew : 39851
nytim : 35078
photo : 30264
latim : 25675
timelin : 22331
peopl : 22087
state : 18939
abcnew : 18291
first : 17716
world : 16791
presid : 16434
polic : 15756
trump : 15424
american : 14014
could : 13698
report : 13688
cbsnew : 13592
would : 13348
donald : 13267
woman : 13004
nation : 12562
offici : 12534
obama : 12088
time : 11999
watch : 11578
shoot : 11320
attack : 10991
today : 10758
video : 10609
think : 10457
countri : 10005
after : 9924
clinton : 9909
leav : 9848
offic : 9664
million : 9612
death : 9473
famili : 9367
stori : 9061
break : 8658
becom : 8578
month : 8568
hous : 8512
hillari : 8385
child : 8242
three : 8202
republican : 8014
chang : 7648
former : 7549
right : 7527


In [None]:
texts

[['judici'],
 ['kailua', 'rental', 'sign', 'compon', 'provis'],
 ['confetti', 'flutter'],
 ['criterion', 'explicit', 'sioux'],
 ['pharmaci', 'robber'],
 ['ring'],
 ['tsunami'],
 ['epson'],
 ['implement'],
 ['mutat', 'netherland', 'contagi', 'transmiss', 'manipul'],
 ['bird', 'blackbird'],
 ['swipe'],
 ['upsid', 'post', 'bound'],
 ['colton',
  'survivalist',
  'margaret',
  'tribun',
  'skyway',
  'barn',
  'rainier',
  'ranger'],
 [],
 ['slacker', 'borat'],
 ['disapprov', 'scorn'],
 ['arson'],
 [],
 ['reflux', 'heartburn', 'parallel'],
 ['outreach', 'tarzan', 'chimp', 'sanctuari', 'cheetah', 'primat'],
 ['landslid', 'earnest', 'iowan', 'loser'],
 ['santorum', 'stephanopoulo'],
 ['rapist'],
 ['coronado'],
 ['caucusgo'],
 ['adderal', 'deficit', 'contenti', 'pharmaci'],
 ['crystal', 'prognost', 'irresist'],
 ['santorum'],
 ['entranc', 'santorum'],
 ['razor', 'santorum'],
 ['santorum', 'edg'],
 ['angela', 'clyde', 'bonni', 'atwood', 'logan', 'mcfarland'],
 ['bachmann', 'michel'],
 ['bachma

In [None]:
for i in range(10):
    print(f'Before loop no.{i+1}:{len(texts)}')
    print(texts.count([]))
    for x in texts:
        if x == []:
            texts.remove(x)

    print(f'After loop no.{i+1}:{len(texts)}')

Before loop no.1:286951
37512
After loop no.1:254902
Before loop no.2:254902
5463
After loop no.2:249963
Before loop no.3:249963
524
After loop no.3:249477
Before loop no.4:249477
38
After loop no.4:249442
Before loop no.5:249442
3
After loop no.5:249440
Before loop no.6:249440
1
After loop no.6:249439
Before loop no.7:249439
0
After loop no.7:249439
Before loop no.8:249439
0
After loop no.8:249439
Before loop no.9:249439
0
After loop no.9:249439
Before loop no.10:249439
0
After loop no.10:249439


In [None]:
score=[]

for k in range(4,13): # Train LDA on different values of k
    print('Number of topics: '+str(k))


    ldamodel =gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=k,
                                       random_state=30,
                                       chunksize=100,
                                       passes=20,
                                       per_word_topics=True)
    print(f'The LDA model for {k} topics is generated. Evaluation is taking place...')

    #Calculating the coherence
    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=texts, dictionary=id2word, coherence='c_v').get_coherence()

    print(f'The model for {k} topics is evaluated. The coherence score is {cm}')
    print('\n')
    score.append((k, cm))
    ldamodel.save(f'./models/model_{k}topics')


score = pd.DataFrame(score)

Number of topics: 4
The LDA model for 4 topics is generated. Evaluation is taking place...
The model for 4 topics is evaluated. The coherence score is 0.5372515956628637


Number of topics: 5
The LDA model for 5 topics is generated. Evaluation is taking place...
The model for 5 topics is evaluated. The coherence score is 0.5635226950150024


Number of topics: 6
The LDA model for 6 topics is generated. Evaluation is taking place...
The model for 6 topics is evaluated. The coherence score is 0.5912416543566589


Number of topics: 7
The LDA model for 7 topics is generated. Evaluation is taking place...
The model for 7 topics is evaluated. The coherence score is 0.6061021075778095


Number of topics: 8
The LDA model for 8 topics is generated. Evaluation is taking place...
The model for 8 topics is evaluated. The coherence score is 0.6173206423800738


Number of topics: 9
The LDA model for 9 topics is generated. Evaluation is taking place...
The model for 9 topics is evaluated. The coherenc

KeyboardInterrupt: 

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# score = pd.DataFrame(score)
# score = score.rename(columns={0:'No. of Topics', 1:'Coherence'}).set_index('No. of Topics')

fig = go.Figure()

fig.add_trace(go.Scatter(x = score.index, y = score['Coherence'], mode='lines', name='Coherence'))

fig.show()

NameError: name 'score' is not defined

In [None]:
import pyLDAvis.gensim_models
from gensim import corpora
import gensim

lda_model = gensim.models.ldamodel.LdaModel.load('./models/model_8topics')
dictionary = corpora.Dictionary.load('./models/model_9topics.id2word')
corpus =  [dictionary.doc2bow(text) for text in texts]

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

NameError: name 'texts' is not defined

In [None]:

lda_model = gensim.models.ldamodel.LdaModel.load('./models/model_9topics')
dictionary = corpora.Dictionary.load('./models/model_9topics.id2word')
corpus =  [dictionary.doc2bow(text) for text in texts]

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
LDAvis_prepared

NameError: name 'corpora' is not defined

In [None]:
lda_model = gensim.models.ldamodel.LdaModel.load('./models/model_7topics')
lda_scores = lda_model[corpus]

In [None]:
len(lda_scores)

286951

In [None]:
lda_scores[286950]

([(0, 0.047653154),
  (1, 0.047653154),
  (2, 0.047653154),
  (3, 0.3806372),
  (4, 0.047653154),
  (5, 0.381097),
  (6, 0.047653154)],
 [(616, [3]), (3644, [5])],
 [(616, [(3, 0.99823767)]), (3644, [(5, 0.9995948)])])

In [None]:
scores = [lda_scores[i][0] for i in range(len(lda_scores))]
score_df = pd.DataFrame(scores)
score_df.head()



In [None]:
for col in score_df.columns:
    score_df.iloc[:,col] = score_df.iloc[:,col].apply(lambda x: (0, 0) if x is None else x)

for col in score_df.columns:
    score_df.iloc[:,col] = score_df.iloc[:,col].apply(lambda x: x[1])


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`



In [149]:
quan_df.posted_at = pd.to_datetime(quan_df.posted_at, )

In [150]:
earliest = quan_df.posted_at.max()
earliest

Timestamp('2016-11-07 23:55:00')

In [152]:
quan_df.posted_at

0        2012-01-01 00:30:26
1        2012-01-01 01:08:58
2        2012-01-01 02:00:37
3        2012-01-01 02:35:20
4        2012-01-01 03:36:01
                 ...        
286954   2016-11-07 22:00:19
286955   2016-11-07 22:35:15
286956   2016-11-07 23:00:16
286957   2016-11-07 23:20:11
286958   2016-11-07 23:40:08
Name: posted_at, Length: 286951, dtype: datetime64[ns]

In [153]:
quan_df.posted_at = quan_df.posted_at.apply(lambda x :  earliest - x)


0         153185074.0
1         153182762.0
2         153179663.0
3         153177580.0
4         153173939.0
             ...     
286954         6881.0
286955         4785.0
286956         3284.0
286957         2089.0
286958          892.0
Name: posted_at, Length: 286951, dtype: float64

In [155]:
quan_df.posted_at = quan_df.posted_at.dt.total_seconds()/(60*60*24)

In [164]:

quan_df = pd.concat([quan_df.drop('status_type', axis=1), pd.get_dummies(quan_df.status_type, drop_first=True, prefix='status_type')], axis=1)
quan_df = pd.concat([quan_df.drop('source', axis=1), pd.get_dummies(quan_df.source, drop_first=True, prefix='source')], axis=1)

In [165]:
quan_df

Unnamed: 0,posted_at,post_type_link,post_type_music,post_type_note,post_type_offer,post_type_photo,post_type_status,post_type_video,status_type_added_video,status_type_created_event,...,status_type_mobile_status_update,status_type_published_story,status_type_shared_story,source_bbc,source_cbs,source_cnn,source_lat,source_nbc,source_nyt,source_wsj
0,1772.975394,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1772.948634,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1772.912766,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1772.888657,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1772.846516,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286954,0.079641,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
286955,0.055382,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
286956,0.038009,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
286957,0.024178,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [166]:
target_df

Unnamed: 0,likes_count,comments_count,shares_count,love_count,wow_count,haha_count,sad_count,thankful_count,angry_count
0,61,27,12,0,0,0,0,0,0
1,120,523,171,0,0,0,0,0,0
2,271,31,0,0,0,0,0,0,0
3,140,188,23,0,0,0,0,0,0
4,59,51,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
286954,305,168,21,38,2,57,2,0,8
286955,177,109,39,10,5,85,3,0,7
286956,41,21,3,2,1,3,0,0,0
286957,28,36,10,1,3,10,1,0,3


In [175]:
score_df

Unnamed: 0,0,1,2,3,4,5,6
0,0.071438,0.571372,0.071438,0.071438,0.071438,0.071438,0.071438
1,0.023838,0.023845,0.856965,0.023838,0.023838,0.023838,0.023838
2,0.047693,0.047693,0.380586,0.047693,0.047693,0.047693,0.380949
3,0.035722,0.035722,0.035722,0.035722,0.035722,0.785671,0.035722
4,0.047636,0.047914,0.047636,0.047636,0.047685,0.047636,0.713856
...,...,...,...,...,...,...,...
286946,0.047636,0.047636,0.380878,0.047636,0.047636,0.047636,0.380942
286947,0.047636,0.047636,0.047636,0.380965,0.047636,0.380856,0.047636
286948,0.190263,0.023841,0.190622,0.190475,0.023841,0.190311,0.190647
286949,0.285326,0.035745,0.035745,0.285791,0.035745,0.285903,0.035745


In [181]:
input_df = pd.concat([score_df.reset_index(), quan_df.reset_index()], axis=1).drop('index', axis=1)

In [183]:
target_df.dtypes

likes_count       int64
comments_count    int64
shares_count      int64
love_count        int64
wow_count         int64
haha_count        int64
sad_count         int64
thankful_count    int64
angry_count       int64
dtype: object

In [184]:
input_df.dtypes

0                                   float32
1                                   float64
2                                   float64
3                                   float64
4                                   float64
5                                   float64
6                                   float64
posted_at                           float64
post_type_link                        uint8
post_type_music                       uint8
post_type_note                        uint8
post_type_offer                       uint8
post_type_photo                       uint8
post_type_status                      uint8
post_type_video                       uint8
status_type_added_video               uint8
status_type_created_event             uint8
status_type_created_note              uint8
status_type_mobile_status_update      uint8
status_type_published_story           uint8
status_type_shared_story              uint8
source_bbc                            uint8
source_cbs                      

In [185]:
input_df.to_csv('x.csv')
target_df.to_csv('y.csv')
