# Task 2: Text Pre-Processing (%45)

In [1]:
import pandas as pd
import langid
from bs4 import BeautifulSoup as bsoup
import re
import os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist



## Firstly, i decided to read the excel file into a pandas dataframe using the ExcelFile pandas functions. I was able to achieve this by:

#### 1. Iterating through all the sheet names and parsing the sheets.
#### 2. dropping all columns with nulls for each sheet while appending and drop all rows with nulls at the same time
#### 3. used pd.concat to merge all the dataframes, removed repeated text, id, created at columns
#### 4. renamed the columns
#### 5. reset row index

In [2]:
# reading excel file

data = pd.ExcelFile('sample.xlsx')
df = []
for sheet in data.sheet_names:
    df.append(data.parse(sheet)
                         .dropna(axis = 0, how ='all')
                         .dropna(axis = 1, how ='all').T.reset_index(drop=True).T)
    
df = pd.concat(df)

df = df[df[0] != 'text']
print(len(df))    
    
df.columns = ['text', 'id', 'created_at']


#reset dataframe index

df = df.reset_index(drop=True)

df.head()


16000


Unnamed: 0,text,id,created_at
0,#Cientificos #Delincuentes https://t.co/qMuDl8...,1241764966912451072,2020-03-22T16:33:33.000Z
1,Packed UK concerts amid rising Covid-19 cases ...,1241689884911383040,2020-03-22T11:35:12.000Z
2,#DiputadasQuerétaro QUERÉTARO REFERENTE A NI...,1241744180352877056,2020-03-22T15:10:57.000Z
3,"QUE PUTAS YA SON TANTOS, PARCEEEE??? O SEA EN ...",1241595765904080896,2020-03-22T05:21:12.000Z
4,en misiones no hay casos porque acá tenemos al...,1241752132455476992,2020-03-22T15:42:33.000Z


In [3]:
langid.classify(str(df['text'][0]))

('es', -25.12564468383789)

## Using langid.classify function to remove rows in my dataframe which have text not in english

In [4]:
#classifying the dataframe using langid, keeping only the english tweets

#remove non english tweets from the dataframe
for i in range(len(df['text'])):
    if langid.classify(str(df['text'][i]))[0] != 'en':
        df = df.drop([i])
        
print(len(df))    

9035


In [5]:
#resetting index
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,text,id,created_at
0,Packed UK concerts amid rising Covid-19 cases ...,1241689884911383040,2020-03-22T11:35:12.000Z
1,@derek_adesso Hover over any location in the w...,1241577460895863040,2020-03-22T04:08:28.000Z
2,https://t.co/AYPd7erCiB,1241586654541824000,2020-03-22T04:45:00.000Z
3,@realDonaldTrump Asshole it's called covid-19,1241583861739311104,2020-03-22T04:33:54.000Z
4,"Fiddling While Rome Burns, The Reboot https://...",1241652623004099072,2020-03-22T09:07:08.000Z


## Removing time from created at column for creating of a key value pair dictionary with dates as keys, and text as values.

In [6]:
#removing time from created at

df[['created_at','extra_col']] = df['created_at'].str.split('T',expand=True)
df = df.drop(columns = ['extra_col'])
df.head()

Unnamed: 0,text,id,created_at
0,Packed UK concerts amid rising Covid-19 cases ...,1241689884911383040,2020-03-22
1,@derek_adesso Hover over any location in the w...,1241577460895863040,2020-03-22
2,https://t.co/AYPd7erCiB,1241586654541824000,2020-03-22
3,@realDonaldTrump Asshole it's called covid-19,1241583861739311104,2020-03-22
4,"Fiddling While Rome Burns, The Reboot https://...",1241652623004099072,2020-03-22


## Creating a dictionary with dates as key and text and values

In [7]:
newdf = df

#creating a dictionary with dates as key and text and values
dict1 = newdf.groupby('created_at')['text'].apply(list).to_dict()


print(len(dict1))

8


In [8]:
len(dict1['2020-03-22'])

1093

In [9]:
#checking if dictionary is created well

k = 0

for i in dict1.keys():
    k+=len(dict1[i])
    
print(k)

9035


## Tokenizing the text in the dictionary and then adding the tokenized words into a list 

In [10]:
# tokenizing the words

tokenized_words = []

for items in list(dict1.keys()):
    
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
    tokens = tokenizer.tokenize(str(dict1[items]).lower()) # converting all the tokens to lower case
    tokenized_words += tokens

len(tokenized_words)

220737

In [11]:
tokenized_words[0:20]

['packed',
 'uk',
 'concerts',
 'amid',
 'rising',
 'covid',
 'cases',
 'shock',
 'social',
 'media',
 'free',
 'malaysia',
 'today',
 'https',
 't',
 'co',
 'r',
 'gu',
 'wj',
 'qy']

## Finding top 200 meaningful bigrams using pmi

In [12]:
#finding top 200 meaningful bigrams using pmi
bigram_measures = nltk.collocations.BigramAssocMeasures()

finder = nltk.collocations.BigramCollocationFinder.from_words(tokenized_words)

finder.apply_freq_filter(20)
finder.apply_word_filter(lambda w: len(w) < 3)

meaningful_bigrams = finder.nbest(bigram_measures.pmi, 200)

# # creating bigram words to add to vocab
# pmi_bigrams = []
# for word in meaningful_bigrams:
#     pmi_bigrams.append('_'.join(word))
    
    
# pmi_bigrams




In [13]:
#mwetokenizer

kkdict = {}

for key in dict1.keys():
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
    tokens = tokenizer.tokenize(str(dict1[key]).lower())  #.lower()
    kkdict[key] = tokens




# mwetokenizer = MWETokenizer(meaningful_bigrams)
# colloc_patents =  dict((pid, mwetokenizer.tokenize(patent)) for pid,patent in kkdict.items())
# all_words_colloc = list(chain.from_iterable(colloc_patents.values()))
# colloc_voc = list(set(all_words_colloc))
# print(len(colloc_voc))




# pids = []
# patent_words = []
# for pid, tokens in colloc_patents.items():
#     pids.append(pid)
#     txt = ' '.join(tokens)
#     patent_words.append(txt)
    
    
    
    
    
# from sklearn.feature_extraction.text import TfidfVectorizer


# tfidf_vectorizer = TfidfVectorizer(input = 'content', analyzer = 'word')
# tfidf_vectors = tfidf_vectorizer.fit_transform(patent_words)
# tfidf_vectors.shape

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word") 

data_features = vectorizer.fit_transform([' '.join(value) for value in kkdict.values()])
print (data_features.shape)



# vocab2 = vectorizer.get_feature_names()

# for word, count in zip(vocab2, data_features.toarray()[0]):
#     if count > 0:
#         print (word, ":", count)



(8, 32027)


In [14]:
save_file = open("vectorise_test.txt", 'w')



vocab = vectorizer.get_feature_names()

cx = data_features.tocoo()

#for each dates
for k in pids:
    save_file.write(str(k) + ',')
    
    for i,j,v in list(itertools.zip_longest(cx.row, cx.col, cx.data)): 
        if i == pids.index(k):
            save_file.write(str(j) + ':' + str(v) + ',')
    # removing the extra comma after each iteration
    save_file.seek(0, 2)              
    save_file.seek(save_file.tell() - 1, 0)  
    save_file.truncate()
    save_file.write('\n') # new line for the next date
    
    
save_file.close()

NameError: name 'pids' is not defined

In [None]:
len(pmi_bigrams)

## Removing stopwords from vocab

In [None]:
#removing stopwords from vocab
        
with open("stopwords_en.txt") as stop_words:
    stopwords = stop_words.read().splitlines()

In [None]:
stopwords[0:6]

In [None]:
#filtering the tokens to remove stopwords

filtered_tokenized_words = [token for token in tokenized_words if token not in stopwords]

len(filtered_tokenized_words)

## Removing tokens with the length less than 3 

In [None]:
# removing tokens with the length less than 3 

len_tokenized_words = [token for token in filtered_tokenized_words if len(token)>=3]

len(len_tokenized_words)

## Remove context-dependent (with the threshold set to more than 60 days) stop words



#### Here i had to tokenized once again, so that i have a dictionary with dates and tokenized words for each date

In [None]:
#context-dependent (with the threshold set to more than 60 days) stop words must be removed from the vocab


#tokenizing by date

new_dict = {}

for key in dict1.keys():
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
    tokens = tokenizer.tokenize(str(dict1[key]).lower())  #.lower()
    new_dict[key] = tokens
    



In [None]:
len(new_dict)

In [None]:
#checking if tokenizing worked

k=0
for i in new_dict.keys():
    k+=len(new_dict[i])
    


k

## Setting up a dictionary to have only unique tokens for each date

In [None]:
unique_dict = {}

for key in new_dict.keys():
    unique = list(set(new_dict[key]))
    unique_dict[key] = unique
    
    

In [None]:
#checking if unique dictionary worked

k=0
for i in unique_dict.keys():
    k+=len(unique_dict[i])
    


k

## Putting all the unique token words from each day into a list for the calculation of value counts, which will allow me to account for words which appear in more than 60 days or less than 5 days.

In [None]:

# putting all the unique words from each day into a list for the calculation of value counts, which will allow me to 
# account of words which appear in more than 60 days and less than 5 days.




# this list has repitions of all unique words in the 81 days
unique_list = []

for key in unique_dict.keys():
    unique_list += unique_dict[key]
    
    
len(unique_list)

## Using the Counter package to make key value pair of unique tokens and their value counts.

## Here value counts refer to frequency of days the word appear out of 81 days.

In [None]:
from collections import Counter

counts = Counter(unique_list)
len(counts)

In [None]:
counts_dict = dict(counts)
len(counts_dict)

In [None]:
import operator
for k,v in sorted(counts_dict.items(), key=operator.itemgetter(1), reverse=True)[:5]:
    print ('key:'+ str(k) + '|','value:' + str(v))

## These are the context dependant words in a list

In [None]:
counts_dict = dict(counts)


# these are the context dependant words in a list

context_dep = []

for key in counts_dict.keys():
    if counts_dict[key]>60:
        context_dep.append(key)
        
  






In [None]:
context_dep[0:5]

## These are rare token words in a list

In [None]:
# these are rare token words in a list

rare_words = []

for key in counts_dict.keys():
    if counts_dict[key]<5:
        rare_words.append(key)

In [None]:
rare_words[0:5]

In [None]:
len(rare_words)

In [None]:
len(context_dep)

## This is a list of all the unique words in all the days

In [None]:
# list of all the unique words in all days

unique_words = set(unique_list)
unique_words

In [None]:
len(unique_words)

## Continuing after removing tokens with the length less than 3 step filtering the tokens to remove  context-dependent stopwords

In [None]:
# continuing after removing tokens with the length less than 3 step

#filtering the tokens to remove  context-dependent stopwords

unique_words = [token for token in unique_words if token not in context_dep]

len(unique_words)

## Filtering the tokens to remove  rare tokens

In [None]:
#filtering the tokens to remove  rare tokens

unique_words = set(unique_words) - set(rare_words)
unique_words = list(unique_words)

len(unique_words)

## Removing stopwords from vocab

In [None]:
#removing stopwords from vocab
        
with open("stopwords_en.txt") as stop_words:
    stopwords = stop_words.read().splitlines()
    
#filtering the tokens to remove stopwords

unique_words = [token for token in unique_words if token not in stopwords]





len(unique_words)

## Removing tokens with the length less than 3 

In [None]:
# removing tokens with the length less than 3 

unique_words = [token for token in unique_words if len(token)>=3]
unique_words = list(unique_words)
len(unique_words)


unique_words

In [None]:
unique_words
len(unique_words)

In [None]:
unique_words

## Stemming tokens using  Porter stemmer

In [None]:
# stemming tokens using  Porter stemmer

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
unique_words = ['{1}'.format(u, stemmer.stem(u)) for u in unique_words]
unique_words = set(unique_words)
unique_words = list(unique_words)
unique_words.sort()
unique_words

In [None]:
len(unique_words)

## Adding the pmi_bigrams to the unique words list from above

In [None]:
vocab_list = unique_words+pmi_bigrams
sorted_vocab = sorted(vocab_list)
sorted_vocab 

In [None]:
len(sorted_vocab)

## Turning sorted vocab list into a dictionary with index as value

In [None]:
vocab_dict = {k: v for v, k in enumerate(sorted_vocab)}

In [None]:
print(vocab_dict)

## Outputting vocab into a file

In [None]:
#outputting vocab into a file


out_file = open("./31043313_vocabgggg.txt", 'w')

for i in vocab_dict.keys():
    out_file.write(str(i) + ':' + str(vocab_dict[i]) + '\n')
out_file.close()

## Unigram - workings

In [None]:
#workings for the unigram section

from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.probability import FreqDist


#tokenizing by date

zdict = {}

for key in dict1.keys():
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
    tokens = tokenizer.tokenize(str(dict1[key]).lower())  #.lower()
    zdict[key] = tokens




    
#removing context independant stopwords  

for date in zdict.keys():
    zdict[date]=[token for token in zdict[date] if token not in stopwords]


#stemming
for date in zdict.keys():
    stem_words = ['{1}'.format(u, stemmer.stem(u)) for u in zdict[date]]
    
    stem_words = list(stem_words)
    stem_words.sort()
    zdict[date] = stem_words
    
#removing words lenth less than 3  
for i in zdict.keys():
    zdict[i]=[token for token in zdict[i] if len(token)>=3]
    
    
# top 100 most common unigrams    
for i in zdict.keys():
    zdict[i] = FreqDist(zdict[i]).most_common(100)      

In [None]:
zdict['2020-03-22']

### Outputting 100uni into a file

In [None]:
#outputting 100uni into a file


out_file = open("./31043313_100unigggg.txt", 'w')
for d in zdict.keys():
    out_file.write(''.join(str(zdict[d])).replace('[',(d + ':[')) + '\n')
out_file.close()

## Bigram - workings

In [None]:
#workings for bigram section
from nltk.util import ngrams

#tokenizing by date

bdict = {}

for key in dict1.keys():
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
    tokens = tokenizer.tokenize(str(dict1[key]).lower())  #.lower()
    bdict[key] = tokens


# top 100 most common bigrams
for i in bdict.keys():
    s = ngrams(bdict[i], n = 2)
    bdict[i] = FreqDist(s).most_common(100)




In [None]:
bdict['2020-03-24']

### Outputting 100bi into a file

In [None]:
#outputting 100bi into a file


out_file = open("./31043313_100bigggg.txt", 'w')
for d in bdict.keys():
    out_file.write(''.join(str(bdict[d])).replace('[',(d + ':[')) + '\n')
out_file.close()

In [None]:
#creating sparse matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word") 






