In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
cols = ['timestamp','tweet_text','username','all_hashtags']

In [3]:
df = pd.read_csv("FacebookDown_instagramdown.csv",header=None, names=cols)

In [4]:
df = df[df['timestamp']!='timestamp']

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 516347 entries, 1 to 516920
Data columns (total 4 columns):
timestamp       516347 non-null object
tweet_text      516347 non-null object
username        516347 non-null object
all_hashtags    516347 non-null object
dtypes: object(4)
memory usage: 19.7+ MB


In [6]:
tweets = df.copy()

In [7]:
tweets.head()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
1,2019-03-14 16:27:07,"True, true. #Instagram #Instagramdown #Facebo...",bklyn_hayshun,"['Instagram', 'Instagramdown', 'Facebookdown']"
2,2019-03-14 16:26:46,How to prepare yourself for social media outag...,Postfity,"['facebookdown', 'instagramdown']"
3,2019-03-14 16:25:41,When #FacebookDown #facebookoutage #InstagramB...,NikeMC,"['FacebookDown', 'facebookoutage', 'InstagramB..."
4,2019-03-14 16:24:50,#FacebookDown and #instagramdown make me conte...,ftchkn,"['FacebookDown', 'instagramdown', 'crypto', 'b..."
5,2019-03-14 16:24:01,"Facebook, Instagram And Whatsapp Hit By Major ...",DailyMulligan,"['FacebookDown', 'instagramdown', 'whatsappdow..."


In [8]:
tweets.describe()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
count,516347,516347,516347,516347
unique,5616,6709,5780,2886
top,2019-03-14 14:34:19,"Thank you @Twitter for being you. Yesterday, @...",BestPhotosFilms,"['FacebookDown', 'instagramdown']"
freq,982,980,2944,43800


# Data Preparation

## HTML encoding

In [9]:
pd.options.display.max_colwidth
text_with_html_encoding = tweets[tweets['tweet_text'].str.contains('&amp')]['tweet_text'].iloc[0]
print(text_with_html_encoding)

How to prepare yourself for social media outage? How can you make sure your social media strategy is sustainable? Check out our guest post on GetaNewsletter's blog &amp; find out how to use social media to collect email subscribers https://t.co/yKXmmCwgpA #facebookdown #instagramdown


In [10]:
from bs4 import BeautifulSoup
example1 = BeautifulSoup(text_with_html_encoding, 'lxml')
print(example1.get_text())

How to prepare yourself for social media outage? How can you make sure your social media strategy is sustainable? Check out our guest post on GetaNewsletter's blog & find out how to use social media to collect email subscribers https://t.co/yKXmmCwgpA #facebookdown #instagramdown


## Lower Case

In [11]:
tweets['tweet_text'] = tweets['tweet_text'].astype(str).str.lower()

## '@' mention

##### I noticed that removing the mention of either facebook, instagram or twitter would make me lose sometimes the tweet's meaning. So I chose to remove any other mention. For example:
###### "thank you twitter for being you. yesterday, facebook and twitter were messing"
###### has more sense than 
###### "thank you  for being you. yesterday,  and  were messing"

In [12]:
# The most frequent tweet as I noticed it contained already a '@' from the tweets.describe()
tweets['tweet_text'].mode()[1]

'thank you @twitter for being you. yesterday, @facebook and @instagram were messing with our mojo #facebookdown #instagramdown'

In [13]:
import re
example = tweets['tweet_text'].mode()[1] + '@amine @hosni'
example = re.sub(r'@facebook','facebook',example)
example = re.sub(r'@twitter','twitter',example)
example = re.sub(r'@instagram','twitter',example)
example = re.sub(r'@[A-Za-z0-9]+','',example)
example

'thank you twitter for being you. yesterday, facebook and twitter were messing with our mojo #facebookdown #instagramdown '

## Defining data cleaning function

In [14]:
pattern = '@facebook|@twitter|@instagram|@[a-z0-9]+|https?://[A-Za-z0-9./]+|www.[^ ]+'

In [15]:
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

In [21]:
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(pattern, '', souped)
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], stripped)
    letters_only = re.sub("[^a-z]", " ", neg_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(letters_only)
    return (" ".join(words)).strip()

testing = tweets['tweet_text'].iloc[:10]
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))
test_result[-2]

'andy did not let facebookdown or instagramdown get him down thank you to everyone who donated a total of to build a stronger community rvgives may be over but your support will impact others for a lifetime of restoringhope'

In [22]:
nums = [0,int(len(tweets)/2),len(tweets)]

In [23]:
nums

[0, 258173, 516347]

In [25]:
%%time
print("Cleaning and parsing the tweets...\n")
clean_tweets = []
for i in range(nums[0],nums[1]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets['tweet_text'].iloc[i]))

Cleaning and parsing the tweets...

Tweets 10000 of 258173 has been processed
Tweets 20000 of 258173 has been processed
Tweets 30000 of 258173 has been processed
Tweets 40000 of 258173 has been processed
Tweets 50000 of 258173 has been processed
Tweets 60000 of 258173 has been processed
Tweets 70000 of 258173 has been processed
Tweets 80000 of 258173 has been processed
Tweets 90000 of 258173 has been processed
Tweets 100000 of 258173 has been processed
Tweets 110000 of 258173 has been processed
Tweets 120000 of 258173 has been processed
Tweets 130000 of 258173 has been processed
Tweets 140000 of 258173 has been processed
Tweets 150000 of 258173 has been processed
Tweets 160000 of 258173 has been processed
Tweets 170000 of 258173 has been processed
Tweets 180000 of 258173 has been processed
Tweets 190000 of 258173 has been processed
Tweets 200000 of 258173 has been processed
Tweets 210000 of 258173 has been processed
Tweets 220000 of 258173 has been processed
Tweets 230000 of 258173 has

In [26]:
len(clean_tweets)

258173

In [27]:
%%time
print("Cleaning and parsing the tweets...\n")
for i in range(nums[1],nums[2]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[2] ))                                                                    
    clean_tweets.append(tweet_cleaner(tweets['tweet_text'].iloc[i]))

Cleaning and parsing the tweets...

Tweets 260000 of 516347 has been processed
Tweets 270000 of 516347 has been processed
Tweets 280000 of 516347 has been processed
Tweets 290000 of 516347 has been processed
Tweets 300000 of 516347 has been processed
Tweets 310000 of 516347 has been processed
Tweets 320000 of 516347 has been processed
Tweets 330000 of 516347 has been processed
Tweets 340000 of 516347 has been processed
Tweets 350000 of 516347 has been processed
Tweets 360000 of 516347 has been processed
Tweets 370000 of 516347 has been processed
Tweets 380000 of 516347 has been processed
Tweets 390000 of 516347 has been processed
Tweets 400000 of 516347 has been processed
Tweets 410000 of 516347 has been processed
Tweets 420000 of 516347 has been processed
Tweets 430000 of 516347 has been processed
Tweets 440000 of 516347 has been processed
Tweets 450000 of 516347 has been processed
Tweets 460000 of 516347 has been processed
Tweets 470000 of 516347 has been processed
Tweets 480000 of 5

In [28]:
len(clean_tweets)

516347

In [29]:
pd.set_option('display.max_colwidth', -1)
clean_df = pd.DataFrame(clean_tweets,columns=['tweet'])
clean_df.head()

Unnamed: 0,tweet
0,true true instagram instagramdown facebookdown
1,how to prepare yourself for social media outage how can you make sure your social media strategy is sustainable check out our guest post on getanewsletter s blog find out how to use social media to collect email subscribers facebookdown instagramdown
2,when facebookdown facebookoutage instagramblackout instagramdown listening to uptight everything s alright by stevie wonder on
3,facebookdown and instagramdown make me contemplate life so much teenagers nowadays spend more than hours each day on social network while they can spend that time educating themselves about crypto blockchain and what project can generate like bcnex bigbom zerobank
4,facebook instagram and whatsapp hit by major outage facebookdown instagramdown whatsappdown worldnews


In [30]:
clean_df.describe()

Unnamed: 0,tweet
count,516347
unique,6521
top,facebookdown and instagramdown so listen to my motivation video
freq,1964


In [31]:
clean_df.drop_duplicates(subset='tweet',keep='first',inplace=True)
clean_df.describe()

Unnamed: 0,tweet
count,6521
unique,6521
top,this facebookdown and instagramdown situation is getting ridiculous i do business of these pages what is really going on
freq,1


In [32]:
clean_df.to_csv('clean_unique_tweets.csv',encoding='utf-8',index=False)

In [37]:
np.sum(clean_df.isnull().any(axis=1))

0

In [None]:
clean_df.isnull().any(axis=0)

In [None]:
tweets.iloc[clean_df[clean_df.isnull().any(axis=1)].index,:].head()

In [None]:
clean_df.dropna(inplace=True)
clean_df.reset_index(drop=True,inplace=True)
clean_df.info()

In [None]:
clean_df.head()

### Different approach to try solving that shifting that occured

In [33]:
%%time
print("Cleaning and parsing the tweets...\n")
ct = pd.DataFrame(columns=cols)
for i in range(nums[0],nums[2]): 
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[2] ))                                                                  
    ct = ct.append({'tweet_text': tweet_cleaner(tweets['tweet_text'].iloc[i]), 'timestamp': tweets['timestamp'].iloc[i],
                                       'username': tweets['username'].iloc[i],'all_hashtags': tweets['all_hashtags'].iloc[i]}, ignore_index=True)

Cleaning and parsing the tweets...

Tweets 10000 of 516347 has been processed
Tweets 20000 of 516347 has been processed
Tweets 30000 of 516347 has been processed
Tweets 40000 of 516347 has been processed
Tweets 50000 of 516347 has been processed
Tweets 60000 of 516347 has been processed
Tweets 70000 of 516347 has been processed
Tweets 80000 of 516347 has been processed
Tweets 90000 of 516347 has been processed
Tweets 100000 of 516347 has been processed
Tweets 110000 of 516347 has been processed
Tweets 120000 of 516347 has been processed
Tweets 130000 of 516347 has been processed
Tweets 140000 of 516347 has been processed
Tweets 150000 of 516347 has been processed
Tweets 160000 of 516347 has been processed
Tweets 170000 of 516347 has been processed
Tweets 180000 of 516347 has been processed
Tweets 190000 of 516347 has been processed
Tweets 200000 of 516347 has been processed
Tweets 210000 of 516347 has been processed
Tweets 220000 of 516347 has been processed
Tweets 230000 of 516347 has

In [35]:
ct.to_csv('clean_tweets.csv',encoding='utf-8',index=False)

In [34]:
ct.head()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
0,2019-03-14 16:27:07,true true instagram instagramdown facebookdown,bklyn_hayshun,"['Instagram', 'Instagramdown', 'Facebookdown']"
1,2019-03-14 16:26:46,how to prepare yourself for social media outage how can you make sure your social media strategy is sustainable check out our guest post on getanewsletter s blog find out how to use social media to collect email subscribers facebookdown instagramdown,Postfity,"['facebookdown', 'instagramdown']"
2,2019-03-14 16:25:41,when facebookdown facebookoutage instagramblackout instagramdown listening to uptight everything s alright by stevie wonder on,NikeMC,"['FacebookDown', 'facebookoutage', 'InstagramBlackout2019', 'instagramdown']"
3,2019-03-14 16:24:50,facebookdown and instagramdown make me contemplate life so much teenagers nowadays spend more than hours each day on social network while they can spend that time educating themselves about crypto blockchain and what project can generate like bcnex bigbom zerobank,ftchkn,"['FacebookDown', 'instagramdown', 'crypto', 'blockchain', 'BCNEX', 'Bigbom', 'Zerobank']"
4,2019-03-14 16:24:01,facebook instagram and whatsapp hit by major outage facebookdown instagramdown whatsappdown worldnews,DailyMulligan,"['FacebookDown', 'instagramdown', 'whatsappdown', 'WorldNews']"


In [38]:
ct.describe()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
count,516347,516347,516347,516347
unique,5616,6521,5780,2886
top,2019-03-14 14:34:19,facebookdown and instagramdown so listen to my motivation video,BestPhotosFilms,"['FacebookDown', 'instagramdown']"
freq,982,1964,2944,43800


In [39]:
top_10_users = ct['username'].value_counts().head(10)
top_10_users

BestPhotosFilms    2944
ReadScoops         1937
Stacktical         1922
Addlanes           1756
carlarjenkins      1553
CatCaspian         1470
johnsmith0234      1470
gdt_countries      1433
ThemujtabaAli      1344
FoxxOrenji         1064
Name: username, dtype: int64

In [40]:
top_10_tweets = ct['tweet_text'].value_counts().head(10)
top_10_tweets

facebookdown and instagramdown so listen to my motivation video                                                                                                                                                                                          1964
telegram messenger gained million new users within last hours thanks to the facebook instagram and whatsapp global outages facebookisdown facebookdown facebookisdown whatsappdown instagramdown instagramblackout                                       1960
hmmm is it a coincidence that facebook and instagram are down during major brexit vote and that is where the highest outages are facebookdown instagramdown freedomofspeech filteringthenews                                                             1472
telegram messenger gained million new users within last hours thanks to the facebook instagram and whatsapp global outages facebookisdown facebookdown facebookisdown whatsappdown instagramdown instagramblackout instagramisdown            

In [None]:
tweets_polarity = ct.copy()