In [0]:
# Mount Google Drive in the Colab runtime; the data files used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import re
import nltk 
# NOTE(review): this binds the top-level matplotlib package (not matplotlib.pyplot)
# to the name `plot`; none of the cells shown here use it.
import matplotlib as plot
import seaborn as sns
# One-time NLTK data download, left disabled.
# NOTE(review): nltk.sent_tokenize presumably needs tokenizer data to be present -- confirm.
# nltk.download()

In [0]:
# Read the tab-separated source file from the mounted Drive folder.
df = pd.read_csv(
    "/content/drive/My Drive/root/test_output",
    sep='\t',
)

Filter and clean the data, then split each document into sentences with the NLTK sentence tokenizer. After tokenizing, each document is stored as a list of sentences within a single column. I used what is known as DataFrame explosion to break each list into separate rows (one sentence per row) while keeping the identifying columns (ticker & date) on every row.

---



In [0]:
# Filter for desired tickers and timeframe.
# NOTE(review): the date filter is a plain string comparison, so it assumes
# `publish_on` holds ISO-style 'YYYY-MM-DD HH:MM:SS' text -- confirm.
recent_mask = df['publish_on'] > '2017-01-01 00:00:00'
# na=False: a missing ticker counts as "no match" instead of putting NaN into
# the boolean mask, which pandas refuses to index with.
nasdaq_mask = df['primary_ticker'].str.contains('NasdaqGS', na=False)
# .copy() detaches the filtered frame so the column assignments below act on
# real data and do not raise SettingWithCopyWarning.
df = df[recent_mask & nasdaq_mask].copy()

# Clean data for tokenizing.
# Missing content becomes '' so the regex and tokenizer steps do not fail on
# NaN; an empty document produces no sentences and therefore no output rows.
content = df['content'].fillna('')
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave every HTML tag in place.
content = content.str.replace(r'<[^<]+?>', '', regex=True)
# Replace everything except letters, digits, commas, periods and apostrophes
# with a space.
df['content'] = content.str.replace(r"[^A-Za-z0-9,.']", ' ', regex=True)

# Tokenize data by sentence: each document becomes a list of sentence strings.
df['tokenized'] = df['content'].apply(nltk.sent_tokenize)
df = df[['primary_ticker', 'publish_on', 'tokenized']]

# DataFrame explode: one output row per sentence, repeating the ticker and
# publish date of the document the sentence came from.
rows = [
    [ticker, published, sentence]
    for ticker, published, sentences in zip(
        df['primary_ticker'], df['publish_on'], df['tokenized']
    )
    for sentence in sentences
]
df_new = pd.DataFrame(rows, columns=['primary_ticker', 'publish_on', 'tokenized'])

Visualize the distribution of sentence length, measured in characters.


In [0]:

# Length of every sentence, in characters.
df_new['text_count'] = df_new['tokenized'].str.len()

sentence_lengths = df_new['text_count']
print("mean", sentence_lengths.mean())
print("median", sentence_lengths.median())

# sns.distplot is deprecated (seaborn 0.11+) and scheduled for removal;
# histplot with kde=True and stat='density' draws the same kind of figure
# (density-normalised histogram with a KDE curve on top).
ax = sns.histplot(sentence_lengths, kde=True, stat='density')
ax.set(
    xlabel='Sentence length (characters)',
    ylabel='Density',
    title='Distribution of sentence length',
);



Determine a sentence-length cutoff below which sentences can be removed from the dataset. Short sentences like "thank you, next question" are unimportant for the analysis.

In [0]:
# Candidate length cutoffs taken from the sentence-length distribution.
# Despite the names, q1 is the 10th percentile and q3 the 95th percentile
# (not the quartiles), so `iqr` is the spread between those two cutoffs.
length_series = df_new['text_count']
q1 = length_series.quantile(0.1)
q3 = length_series.quantile(.95)
iqr = q3 - q1

for cutoff in (q1, q3):
    print(cutoff)


Remove any undesired tickers. Also apply a minimum length for sentences.

In [0]:
# Apply minimum length of characters: keep sentences strictly longer than the
# lower cutoff q1 computed in the previous cell.
long_enough = df_new['tokenized'].str.len() > q1
# The matching upper bound (q3) is currently disabled:
# df2 = df2[df2['tokenized'].apply(lambda x: len(x) <= q3)]

# Tickers excluded from the analysis, kept in one list instead of one filter
# statement per ticker.
EXCLUDED_TICKERS = [
    'NasdaqGS:AMSW.A',
    'NasdaqGS:ASCM.A',
    'NasdaqGS:BELF.B',
    'NasdaqGS:CMCS.A',
    'NasdaqGS:DISC.A',
    'NasdaqGS:GNCM.A',
    'NasdaqGS:IMKT.A',
    'NasdaqGS:KELY.A',
    'NasdaqGS:LBTY.A',
    'NasdaqGS:NRCI.B',
    'NasdaqGS:STRZ.A',
    'NasdaqGS:VLCC.F',
]
is_excluded = df_new['primary_ticker'].isin(EXCLUDED_TICKERS)

# .copy() makes df2 an independent frame so the new columns below are written
# to real data and do not raise SettingWithCopyWarning.
df2 = df_new[long_enough & ~is_excluded].copy()

df2['publish_on'] = pd.to_datetime(df2['publish_on'])
# Calendar date of publication formatted as MM/DD/YYYY.
df2['date'] = df2['publish_on'].dt.strftime('%m/%d/%Y')

# Split 'EXCHANGE:TICKER' at the first colon only. `n` must be passed by
# keyword, and the `.str` accessor can no longer be tuple-unpacked
# (pandas >= 2.0), so each part is taken by position instead.
ticker_parts = df2['primary_ticker'].str.split(':', n=1)
df2['exchange'] = ticker_parts.str[0]
df2['ticker'] = ticker_parts.str[1]


Export the processed data to CSV so that later steps do not have to reprocess the original 6 GB text file.


In [0]:
from google.colab import files

# Write the cleaned sentence-level table to Drive (default to_csv options,
# so the row index is written as the first column).
df2.to_csv(
    '/content/drive/My Drive/root/text_model.csv'
)
# Optional browser download of the export, left disabled:
# files.download('.csv')