In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Column names assigned to the raw CSV, in file order.
cols = [
    'timestamp',
    'tweet_text',
    'username',
    'all_hashtags',
]

In [None]:
# header=None together with names=cols: no line of the file is used as a header,
# so any header line present in the CSV is loaded as an ordinary data row.
df = pd.read_csv("FacebookDown_instagramdown.csv",header=None, names=cols)

In [None]:
# Keep only the rows whose 'timestamp' cell is not the literal text 'timestamp'
# (presumably header lines repeated inside the CSV -- verify against the file).
df = df.loc[df['timestamp'].ne('timestamp')]

In [None]:
df.info()

In [None]:
tweets = df.copy()

In [None]:
tweets.head()

In [None]:
tweets.describe()

# Data Preparation

## HTML encoding

In [None]:
# Pick the first tweet that contains the literal text '&amp' as a sample of
# HTML-encoded content.
#   na=False    -> cells with no text count as "no match"; without it the mask
#                  contains NaN and pandas refuses to index with it.
#   regex=False -> '&amp' is a plain substring, not a regular expression.
has_html_entity = tweets['tweet_text'].str.contains('&amp', na=False, regex=False)
text_with_html_encoding = tweets.loc[has_html_entity, 'tweet_text'].iloc[0]
print(text_with_html_encoding)

In [None]:
from bs4 import BeautifulSoup
# Parse the sample tweet with the 'lxml' parser; get_text() returns only the
# text content, which is what turns HTML entities back into plain characters.
example1 = BeautifulSoup(text_with_html_encoding, 'lxml')
print(example1.get_text())

## Lower Case

In [None]:
# Cast every cell to str, then lower-case it.
# NOTE(review): astype(str) turns a missing value into the literal text 'nan',
# which is then cleaned like a real tweet -- confirm this is acceptable.
tweets['tweet_text'] = tweets['tweet_text'].astype(str).str.lower()

## '@' mention

##### I noticed that removing the mentions of facebook, instagram or twitter would sometimes make the tweet lose its meaning, so I chose to keep those three names and remove only the other mentions. For example:
###### "thank you twitter for being you. yesterday, facebook and twitter were messing"
###### makes more sense than
###### "thank you  for being you. yesterday,  and  were messing"

In [None]:
# tweets.describe() showed that the most frequent tweet already contains an '@' mention.
# NOTE(review): Series.mode() returns every value tied for the highest count, so
# label 1 is the second of those tied values and raises KeyError when there is
# only one mode -- confirm that [1] (rather than [0]) is intended.
tweets['tweet_text'].mode()[1]

In [None]:
import re
# Demonstrate the mention handling on the sample tweet, with two extra user
# mentions appended to it.
example = tweets['tweet_text'].mode()[1] + '@amine @hosni'
# Keep the three platform names as plain words by dropping only the '@'.
# (The original replaced '@instagram' with 'twitter' by mistake.)
example = re.sub(r'@(facebook|twitter|instagram)', r'\1', example)
# Remove every other mention; '_' is included because it is valid in handles.
example = re.sub(r'@[A-Za-z0-9_]+', '', example)
example

## Defining data cleaning function

In [None]:
# Text removed from every (lower-cased) tweet:
#   @[a-z0-9_]+   any @mention; '_' is included so '@user_name' is removed whole
#                 instead of leaving '_name' behind. The previous explicit
#                 '@facebook|@twitter|@instagram' alternatives are gone because
#                 they matched first and left residue ('@facebookdown' -> 'down').
#   https?://\S+  a link up to the next whitespace (the old character class
#                 stopped at '-', '?', '_', ... and left the tail in the text)
#   www\.\S+      a bare 'www.' link; the dot is escaped so that words such as
#                 'awwwsome' are no longer eaten.
pattern = r'@[a-z0-9_]+|https?://\S+|www\.\S+'

In [None]:
# Contracted negations mapped to their expanded two-word form.
negations_dic = {
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "can't": "can not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not",
}
# A single alternation over all contractions, delimited by word boundaries.
neg_pattern = re.compile(r'\b({})\b'.format('|'.join(negations_dic)))

In [None]:
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

def tweet_cleaner(text):
    """Return a cleaned, space-normalised version of one lower-cased tweet.

    Steps, in order:
      1. parse the text with BeautifulSoup and keep only its text content;
      2. turn ``@facebook``, ``@twitter`` and ``@instagram`` into the plain
         platform name, so sentences that talk about the platforms keep
         their meaning;
      3. delete everything matched by the module-level ``pattern``
         (remaining mentions and links);
      4. expand negation contractions using ``negations_dic``;
      5. replace every character that is not ``a-z`` with a space;
      6. collapse the runs of spaces created by the previous steps.

    Parameters
    ----------
    text : str
        Raw tweet text. It is expected to be lower-cased already, because
        step 5 discards upper-case letters.

    Returns
    -------
    str
        The cleaned tweet: lower-case words separated by single spaces.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # Done before applying `pattern`, which would otherwise delete these
    # mentions completely.
    platforms_kept = re.sub(r'@(facebook|twitter|instagram)', r'\1', souped)
    stripped = re.sub(pattern, '', platforms_kept)
    # Contractions typed with a typographic apostrophe (U+2019) would not
    # match the keys of negations_dic, which use the ASCII apostrophe.
    straightened = stripped.replace('\u2019', "'")
    neg_handled = neg_pattern.sub(lambda m: negations_dic[m.group()], straightened)
    letters_only = re.sub("[^a-z]", " ", neg_handled)
    # The substitution above creates unnecessary white space; tokenizing and
    # joining the tokens again removes it.
    words = tok.tokenize(letters_only)
    return " ".join(words).strip()

# Smoke-test the cleaner on the first ten tweets and display the
# second-to-last cleaned result.
testing = tweets['tweet_text'].iloc[:10]
test_result = [tweet_cleaner(raw_text) for raw_text in testing]
test_result[-2]

In [None]:
# Row boundaries used to clean the tweets in two halves: start, middle, end.
nums = [0, len(tweets) // 2, len(tweets)]

In [None]:
nums

In [None]:
%%time
print("Cleaning and parsing the tweets...\n")
# First half: rows nums[0] .. nums[1]-1. The slice is taken once and iterated
# alongside its row numbers, which are only needed for the progress message.
clean_tweets = []
for i, raw_text in zip(range(nums[0], nums[1]),
                       tweets['tweet_text'].iloc[nums[0]:nums[1]]):
    if (i + 1) % 10000 == 0:
        print("Tweets %d of %d has been processed" % (i + 1, nums[1]))
    clean_tweets.append(tweet_cleaner(raw_text))

In [None]:
len(clean_tweets)

In [None]:
%%time
print("Cleaning and parsing the tweets...\n")
# Second half: rows nums[1] .. nums[2]-1, appended to the first-half results.
# Anything beyond the first half is dropped first, so running this cell again
# does not append the second half a second time.
del clean_tweets[nums[1] - nums[0]:]
for i, raw_text in zip(range(nums[1], nums[2]),
                       tweets['tweet_text'].iloc[nums[1]:nums[2]]):
    if (i + 1) % 10000 == 0:
        print("Tweets %d of %d has been processed" % (i + 1, nums[2]))
    clean_tweets.append(tweet_cleaner(raw_text))

In [None]:
len(clean_tweets)

In [None]:
# Show the full text of every tweet instead of truncating the column.
# None is the current spelling of "no limit"; the legacy value -1 is rejected
# by recent pandas, while very old pandas only accepts the integer form.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
clean_df = pd.DataFrame(clean_tweets,columns=['tweet'])
clean_df.head()

In [None]:
clean_df.describe()

In [None]:
# Keep the first occurrence of each distinct cleaned tweet, then summarise.
clean_df = clean_df.drop_duplicates(subset='tweet', keep='first')
clean_df.describe()

In [None]:
clean_df.to_csv('clean_unique_tweets.csv',encoding='utf-8',index=False)

### Getting all columns, not only the tweets' text

In [None]:
# Quarter boundaries of the row range: 0, 1/4, 1/2, 3/4 and the end.
nums = [len(tweets) * quarter // 4 for quarter in range(5)]
nums

In [None]:
%%time
print("Cleaning and parsing the tweets...\n")
# Collect the rows in a plain list and build the frame once at the end.
# DataFrame.append copied the whole frame on every call (quadratic time) and
# no longer exists in pandas 2.x.
selected = tweets.iloc[nums[0]:nums[4]]
rows = []
for i, (raw_text, timestamp, username) in enumerate(
        zip(selected['tweet_text'], selected['timestamp'], selected['username']),
        start=nums[0]):
    if (i + 1) % 10000 == 0:
        print("Tweets %d of %d has been processed" % (i + 1, nums[4]))
    rows.append({'tweet_text': tweet_cleaner(raw_text),
                 'timestamp': timestamp,
                 'username': username})
# columns=cols keeps the same column order as before; 'all_hashtags' is not
# filled in by this loop, so that column stays empty (NaN), as it did before.
ct = pd.DataFrame(rows, columns=cols)

In [None]:
ct.to_csv('clean_tweets.csv',encoding='utf-8',index=False)

In [None]:
ct.head()

In [None]:
ct.describe()

In [None]:
top_10_users = ct['username'].value_counts().head(10)
top_10_users

In [None]:
top_10_tweets = ct['tweet_text'].value_counts().head(10)
top_10_tweets

In [None]:
tweets_polarity = ct.copy()