In [1]:
% matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import pickle

from datetime import datetime, date, time
from textblob import TextBlob
from textblob import Word
from pre_processing import *
from nltk.corpus import stopwords
pd.set_option('display.max_colwidth', -1)

In [2]:
#Load Data Sets
DATA_FOLDER = './data'

tweets_col_names=['text']

tweets_dtypes = {'text': str }

print("Loading Data Set....")
tweets_neg = pd.read_fwf(DATA_FOLDER + '/train_neg.txt',  names=tweets_col_names,
                                  dtype=tweets_dtypes)
tweets_pos = pd.read_fwf(DATA_FOLDER + '/train_pos.txt',  names=tweets_col_names,
                                  dtype=tweets_dtypes)
tweets_neg_full = pd.read_fwf(DATA_FOLDER + '/train_neg_full.txt',  names=tweets_col_names,
                                  dtype=tweets_dtypes)
tweets_pos_full = pd.read_fwf(DATA_FOLDER + '/train_pos_full.txt',  names=tweets_col_names,
                                  dtype=tweets_dtypes)
tweets_test = pd.read_fwf(DATA_FOLDER + '/test_data.txt', header=None)
tweets_test.drop(columns=[1,2], inplace=True)
tweets_test.rename(columns={0:'text'}, inplace= True)

    
print("Data Set loaded !")


#bool variables to clean data
#If set to False the cleaning process will take place
already_cleaned_neg = False
already_cleaned_pos = False
already_cleaned_neg_full = False
already_cleaned_pos_full = False
already_cleaned_test = False

#variables to define which functions to apply 
#to the clean_data function
duplicates = True
emojis = False
punctuaction = False
handle_number = False
special_symbols = True
moreLetters = False
contractions = False
clean_stopwords = False
spelling = False
lemmatize = False


Loading Data Set....
Data Set loaded !


In [3]:
def clean_data(data):
    """Function to clean the data.

    Args:
        data: data to clean
        
    Returns:
        data cleaned
    """
    #increase index to start at 1
    data.index = data.index +1
    
    #remove duplicates
    if duplicates:
        print("removing duplicates!")
        data.drop_duplicates(inplace=True)
        
    if emojis:
        print("emoji")
        data['text'] = data['text'].apply(lambda x: interpret_emoji(x))
    
    if punctuaction:
        print("punctuaction!")
        #remove punctuaction .........
        data['text'] = data['text'].str.replace('.','')
        data['text'] = data['text'].str.replace(',','')
        
    if handle_number:
        print("handling numbers!!")
        #separate number with letters  1234test123 =>  test 
        data['text'] = data['text'].str.replace('[0-9]+',' <number> ')
    
    if special_symbols:
        print("special symbols!!")
        data['text'] = data['text'].str.replace('[();=\\/:*?\|&]+','')
        
    if moreLetters:
        print("more letters!")
        data['text'] = data['text'].apply(lambda x: replace_moreletters(x))
    
    if contractions:
        print("expanding contractions!")
        data['text'] = data['text'].apply(lambda x: expand_contractions(x))

    if clean_stopwords:
        print("removing stop words!")
        data['text'] = data['text'].apply(lambda x: " ".join\
                                          (x for x in x.split() if x not in stop))
    if spelling:
        print("correcting spelling!")
        data['text'] = data['text'].apply(lambda x: ''.join\
                                          (TextBlob(x).correct()))
    if lemmatize:
        print("lemmatizing!")
        w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
        lemmatizer = nltk.stem.WordNetLemmatizer()
        stop = stopwords.words('english')
        def lemmatize_text(text):
            return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
        
        data['text_lema'] = data['text'].apply(lemmatize_text)
        data['text_lema'] = data['text_lema'].apply(lambda x: ' '.join(x))
        data['text_lema'] = data['text_lema'].str.replace(',','')

        
    print("one space!")
    data['text'] = data['text'].apply(lambda x: one_space(x)) #ok 
    
    
    
    return  data



In [4]:
if not already_cleaned_neg :
    print("Cleaning Tweets Neg")
    tweets_neg.drop_duplicates(inplace=True)
    clean_data(tweets_neg)
    #save the file to pickle
    print("Saving file with preprocessed Tweets")
    pickle_out = open("data/pre_processed/tweets_neg_cleaned.pickle","wb")
    if not lemmatize:
        pickle.dump(tweets_neg['text'], pickle_out)
    else:
        pickle.dump(tweets_neg['text_lema'], pickle_out)
    pickle_out.close()
    #save file to text
    f = open("data/pre_processed/tweets_neg_cleaned.txt", "w", encoding='utf-8')
    if not lemmatize:
        f.write("\n".join(map(lambda x: str(x), tweets_neg['text'])))
    else:
        f.write("\n".join(map(lambda x: str(x), tweets_neg['text_lema'])))
    f.close()
    print("Saved!")
else:
    print("Tweets Pos Already Cleaned!")
pickle_in = open("data/pre_processed/tweets_neg_cleaned.pickle","rb")
tweets_neg_pickle = pickle.load(pickle_in)
print("Opening pickle")
tweets_neg_pickle.head()


Cleaning Tweets Neg
removing duplicates!
special symbols!!
one space!
Saving file with preprocessed Tweets
Saved!
Opening pickle


1    vinco tresorpack 6 difficulty 10 of 10 object disassemble and reassemble the wooden pieces this beautiful wo ... <url>
2    glad i dot have taks tomorrow ! ! #thankful #startho                                                                  
3    1-3 vs celtics in the regular season were fucked if we play them in the playoffs                                      
4    <user> i could actually kill that girl i'm so sorry ! ! !                                                             
5    <user> <user> <user> i find that very hard to believe im afraid                                                       
Name: text, dtype: object

In [5]:
if not already_cleaned_pos :
    print("Cleaning Tweets Pos")
    tweets_pos.drop_duplicates(inplace=True)
    clean_data(tweets_pos)
    #save the file to pickle
    print("Saving file with preprocessed Tweets")
    pickle_out = open("data/pre_processed/tweets_pos_cleaned.pickle","wb")
    if not lemmatize:
        pickle.dump(tweets_pos['text'], pickle_out)
    else:
        pickle.dump(tweets_pos['text_lema'], pickle_out)
    pickle_out.close()
    #save file to text
    f = open("data/pre_processed/tweets_pos_cleaned.txt", "w", encoding='utf-8')
    if not lemmatize:
        f.write("\n".join(map(lambda x: str(x), tweets_pos['text'])))
    else:
        f.write("\n".join(map(lambda x: str(x), tweets_pos['text_lema'])))
    f.close()
    print("Saved!")
else:
    print("Tweets Neg Already Cleaned!")
pickle_in = open("data/pre_processed/tweets_pos_cleaned.pickle","rb")
tweets_pos_pickle = pickle.load(pickle_in)
print("Opening pickle")
tweets_pos_pickle.head()
    

Cleaning Tweets Pos
removing duplicates!
special symbols!!
one space!
Saving file with preprocessed Tweets
Saved!
Opening pickle


1    <user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15
2    because your logic is so dumb , i won't even crop out your name or your photo . tsk . <url>                                    
3    " <user> just put casper in a box ! " looved the battle ! #crakkbitch                                                          
4    <user> <user> thanks sir > > don't trip lil mama ... just keep doin ya thang !                                                 
5    visiting my brother tmr is the bestest birthday gift eveerrr ! ! !                                                             
Name: text, dtype: object

In [6]:
if not already_cleaned_neg_full :
    print("Cleaning Tweets Neg Full")
    tweets_neg_full.drop_duplicates(inplace=True)
    clean_data(tweets_neg_full)
    #save the file to pickle
    print("Saving file with preprocessed Tweets")
    pickle_out = open("data/pre_processed/tweets_neg_full_cleaned.pickle","wb")
    if not lemmatize:
        pickle.dump(tweets_neg_full['text'], pickle_out)
    else:
        pickle.dump(tweets_neg_full['text_lema'], pickle_out)
    pickle_out.close()
    #save file to text
    f = open("data/pre_processed/tweets_neg_full_cleaned.txt", "w", encoding='utf-8')
    if not lemmatize:
        f.write("\n".join(map(lambda x: str(x), tweets_neg_full['text'])))
    else:
        f.write("\n".join(map(lambda x: str(x), tweets_neg_full['text_lema'])))
    f.close()
    print("Saved!")
else:
    print("Tweets Neg Full Already Cleaned!")
pickle_in = open("data/pre_processed/tweets_neg_full_cleaned.pickle","rb")
tweets_neg_full_pickle = pickle.load(pickle_in)
print("Opening pickle")
tweets_neg_full_pickle.head()
    

Cleaning Tweets Neg Full
removing duplicates!
special symbols!!
one space!
Saving file with preprocessed Tweets
Saved!
Opening pickle


1    vinco tresorpack 6 difficulty 10 of 10 object disassemble and reassemble the wooden pieces this beautiful wo ... <url>
2    glad i dot have taks tomorrow ! ! #thankful #startho                                                                  
3    1-3 vs celtics in the regular season were fucked if we play them in the playoffs                                      
4    <user> i could actually kill that girl i'm so sorry ! ! !                                                             
5    <user> <user> <user> i find that very hard to believe im afraid                                                       
Name: text, dtype: object

In [7]:
if not already_cleaned_pos_full :
    print("Cleaning Tweets Pos Full")
    tweets_pos_full.drop_duplicates(inplace=True)
    clean_data(tweets_pos_full)
    #save the file to pickle
    print("Saving file with preprocessed Tweets")
    pickle_out = open("data/pre_processed/tweets_pos_full_cleaned.pickle","wb")
    if not lemmatize:
        pickle.dump(tweets_pos_full['text'], pickle_out)
    else:
        pickle.dump(tweets_pos_full['text_lema'], pickle_out)
    pickle_out.close()
    #save file to text
    f = open("data/pre_processed/tweets_pos_full_cleaned.txt", "w", encoding='utf-8')
    if not lemmatize:
        f.write("\n".join(map(lambda x: str(x), tweets_pos_full['text'])))
    else:
        f.write("\n".join(map(lambda x: str(x), tweets_pos_full['text_lema'])))
    f.close()
    print("Saved!")
else:
    print("Tweets Pos Full Already Cleaned!")
pickle_in = open("data/pre_processed/tweets_pos_full_cleaned.pickle","rb")
tweets_pos_full_pickle = pickle.load(pickle_in)
print("Opening pickle")
tweets_pos_full_pickle.head()
    

Cleaning Tweets Pos Full
removing duplicates!
special symbols!!
one space!
Saving file with preprocessed Tweets
Saved!
Opening pickle


1    <user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15
2    because your logic is so dumb , i won't even crop out your name or your photo . tsk . <url>                                    
3    " <user> just put casper in a box ! " looved the battle ! #crakkbitch                                                          
4    <user> <user> thanks sir > > don't trip lil mama ... just keep doin ya thang !                                                 
5    visiting my brother tmr is the bestest birthday gift eveerrr ! ! !                                                             
Name: text, dtype: object

In [8]:
if not already_cleaned_test :
    tweets_test.index = tweets_test.index +1
    tweets_test['text'] = tweets_test['text'].str.replace('\d+,', '')
    print("Cleaning Test")
    clean_data(tweets_test).head()
    print("Saving file with preprocessed Tweets")
    #save to pickle
    pickle_out = open("data/pre_processed/test_data_cleaned.pickle","wb")
    if not lemmatize:
        pickle.dump(tweets_test['text'], pickle_out)
    else:
        pickle.dump(tweets_test['text_lema'], pickle_out)
    pickle_out.close()
    #save to txt
    f = open("data/pre_processed/test_data_cleaned.txt", "w", encoding='utf-8')
    if not lemmatize:
        f.write("\n".join(map(lambda x: str(x[0]+1) + ',' + str(x[1]), enumerate(tweets_test['text']))))
    else:
        f.write("\n".join(map(lambda x: str(x[0]+1) + ',' + str(x[1]), enumerate(tweets_test['text_lema']))))
    f.close()
    print("Saved!")
else:
    print("Test Already Cleaned!")
pickle_in = open("data/pre_processed/test_data_cleaned.pickle","rb")
tweets_test_pickle = pickle.load(pickle_in)
print("Opening pickle")
tweets_test_pickle.head()
    

Cleaning Test
removing duplicates!
special symbols!!
one space!
Saving file with preprocessed Tweets
Saved!
Opening pickle


2    sea doo pro sea scooter sports with the portable sea-doo seascootersave air , stay longer in the water and ... <url>     
3    <user> shucks well i work all week so now i can't come cheer you on ! oh and put those batteries in your calculator ! ! !
4    i cant stay away from bug thats my baby                                                                                  
5    <user> no ma'am ! ! ! lol im perfectly fine and not contagious anymore lmao                                              
6    whenever i fall asleep watching the tv , i always wake up with a headache                                                
Name: text, dtype: object