In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
cols = ['timestamp','tweet_text','username','all_hashtags']

In [3]:
df = pd.read_csv("FacebookDown_instagramdown.csv",header=None, names=cols)

In [4]:
df = df[df['timestamp']!='timestamp']

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 446745 entries, 1 to 447261
Data columns (total 4 columns):
timestamp       446745 non-null object
tweet_text      446745 non-null object
username        446745 non-null object
all_hashtags    446745 non-null object
dtypes: object(4)
memory usage: 17.0+ MB


In [6]:
df.describe()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
count,446745,446745,446745,446745
unique,1301,1312,1181,796
top,2019-03-14 14:34:19,"Thank you @Twitter for being you. Yesterday, @...",BestPhotosFilms,"['FacebookDown', 'instagramdown']"
freq,890,888,2668,37176


In [7]:
tweets = df.copy()

In [8]:
tweets.head()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
1,2019-03-14 16:27:07,"True, true. #Instagram #Instagramdown #Facebo...",bklyn_hayshun,"['Instagram', 'Instagramdown', 'Facebookdown']"
2,2019-03-14 16:26:46,How to prepare yourself for social media outag...,Postfity,"['facebookdown', 'instagramdown']"
3,2019-03-14 16:25:41,When #FacebookDown #facebookoutage #InstagramB...,NikeMC,"['FacebookDown', 'facebookoutage', 'InstagramB..."
4,2019-03-14 16:24:50,#FacebookDown and #instagramdown make me conte...,ftchkn,"['FacebookDown', 'instagramdown', 'crypto', 'b..."
5,2019-03-14 16:24:01,"Facebook, Instagram And Whatsapp Hit By Major ...",DailyMulligan,"['FacebookDown', 'instagramdown', 'whatsappdow..."


In [9]:
tweets.describe()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags
count,446745,446745,446745,446745
unique,1301,1312,1181,796
top,2019-03-14 14:34:19,"Thank you @Twitter for being you. Yesterday, @...",BestPhotosFilms,"['FacebookDown', 'instagramdown']"
freq,890,888,2668,37176


# Data Preparation

## HTML encoding

In [10]:
pd.options.display.max_colwidth
text_with_html_encoding = tweets[tweets['tweet_text'].str.contains('&amp')]['tweet_text'].iloc[0]
print(text_with_html_encoding)

How to prepare yourself for social media outage? How can you make sure your social media strategy is sustainable? Check out our guest post on GetaNewsletter's blog &amp; find out how to use social media to collect email subscribers https://t.co/yKXmmCwgpA #facebookdown #instagramdown


In [11]:
from bs4 import BeautifulSoup
example1 = BeautifulSoup(text_with_html_encoding, 'lxml')
print(example1.get_text())

How to prepare yourself for social media outage? How can you make sure your social media strategy is sustainable? Check out our guest post on GetaNewsletter's blog & find out how to use social media to collect email subscribers https://t.co/yKXmmCwgpA #facebookdown #instagramdown


## Lower Case

In [12]:
tweets['tweet_text'] = tweets['tweet_text'].astype(str).str.lower()

## '@' mention

##### I noticed that removing the mention of either facebook, instagram or twitter would make me lose sometimes the tweet's meaning. So I chose to remove any other mention. For example:
###### "thank you twitter for being you. yesterday, facebook and twitter were messing"
###### has more sense than 
###### "thank you  for being you. yesterday,  and  were messing"

In [13]:
# The most frequent tweet as I noticed it contained already a '@' from the tweets.describe()
tweets['tweet_text'].mode()[1]

'thank you @twitter for being you. yesterday, @facebook and @instagram were messing with our mojo #facebookdown #instagramdown'

In [14]:
import re
example = tweets['tweet_text'].mode()[1] + '@amine @hosni'
example = re.sub(r'@facebook','facebook',example)
example = re.sub(r'@twitter','twitter',example)
example = re.sub(r'@instagram','twitter',example)
example = re.sub(r'@[A-Za-z0-9]+','',example)
example

'thank you twitter for being you. yesterday, facebook and twitter were messing with our mojo #facebookdown #instagramdown '

## Defining data cleaning function

In [15]:
pattern = '@facebook|@twitter|@instagram|@[a-z0-9]+|https?://[A-Za-z0-9./]+'

In [16]:
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(pattern, '', souped)
    letters_only = re.sub("[^a-z]", " ", stripped)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(letters_only)
    return (" ".join(words)).strip()

testing = tweets['tweet_text'].iloc[:100]
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))
test_result

['true true instagram instagramdown facebookdown',
 'how to prepare yourself for social media outage how can you make sure your social media strategy is sustainable check out our guest post on getanewsletter s blog find out how to use social media to collect email subscribers facebookdown instagramdown',
 'when facebookdown facebookoutage instagramblackout instagramdown listening to uptight everything s alright by stevie wonder on',
 'facebookdown and instagramdown make me contemplate life so much teenagers nowadays spend more than hours each day on social network while they can spend that time educating themselves about crypto blockchain and what project can generate like bcnex bigbom zerobank',
 'facebook instagram and whatsapp hit by major outage facebookdown instagramdown whatsappdown worldnews',
 'the blessings of the social blackout for our brain peace focus brain addiction hypothalamus priorities resilience facebookdown instagramdown',
 'checkout this coversong souhilabenlachhab

In [17]:
nums = [0,223372,446745]

In [187]:
%%time
print("Cleaning and parsing the tweets...\n")
clean_tweets = []
for i in range(nums[0],nums[1]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets['tweet_text'].iloc[i]))

Cleaning and parsing the tweets...

Tweets 10000 of 217285 has been processed
Tweets 20000 of 217285 has been processed
Tweets 30000 of 217285 has been processed
Tweets 40000 of 217285 has been processed
Tweets 50000 of 217285 has been processed
Tweets 60000 of 217285 has been processed
Tweets 70000 of 217285 has been processed
Tweets 80000 of 217285 has been processed
Tweets 90000 of 217285 has been processed
Tweets 100000 of 217285 has been processed
Tweets 110000 of 217285 has been processed
Tweets 120000 of 217285 has been processed
Tweets 130000 of 217285 has been processed
Tweets 140000 of 217285 has been processed
Tweets 150000 of 217285 has been processed
Tweets 160000 of 217285 has been processed
Tweets 170000 of 217285 has been processed
Tweets 180000 of 217285 has been processed
Tweets 190000 of 217285 has been processed
Tweets 200000 of 217285 has been processed
Tweets 210000 of 217285 has been processed
Wall time: 1min 2s


In [188]:
len(clean_tweets)

217285

In [189]:
%%time
print("Cleaning and parsing the tweets...\n")
for i in range(nums[1],nums[2]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[2] ))                                                                    
    clean_tweets.append(tweet_cleaner(tweets['tweet_text'].iloc[i]))

Cleaning and parsing the tweets...

Tweets 220000 of 434571 has been processed
Tweets 230000 of 434571 has been processed
Tweets 240000 of 434571 has been processed
Tweets 250000 of 434571 has been processed
Tweets 260000 of 434571 has been processed
Tweets 270000 of 434571 has been processed
Tweets 280000 of 434571 has been processed
Tweets 290000 of 434571 has been processed
Tweets 300000 of 434571 has been processed
Tweets 310000 of 434571 has been processed
Tweets 320000 of 434571 has been processed
Tweets 330000 of 434571 has been processed
Tweets 340000 of 434571 has been processed
Tweets 350000 of 434571 has been processed
Tweets 360000 of 434571 has been processed
Tweets 370000 of 434571 has been processed
Tweets 380000 of 434571 has been processed
Tweets 390000 of 434571 has been processed
Tweets 400000 of 434571 has been processed
Tweets 410000 of 434571 has been processed
Tweets 420000 of 434571 has been processed
Tweets 430000 of 434571 has been processed
Wall time: 1min 5s

In [190]:
len(clean_tweets)

434571

In [242]:
clean_df = pd.DataFrame(clean_tweets,columns=['tweet'])
clean_df['timestamp'] = df['timestamp']
clean_df['username'] = tweets['username']
clean_df['all_hashtags'] = tweets['all_hashtags']
clean_df.head()

Unnamed: 0,tweet,timestamp,username,all_hashtags
0,true true instagram instagramdown facebookdown,,,
1,how to prepare yourself for social media outage how can you make sure your social media strategy is sustainable check out our guest post on getanewsletter s blog find out how to use social media to collect email subscribers facebookdown instagramdown,2019-03-14 16:27:07,bklyn_hayshun,"['Instagram', 'Instagramdown', 'Facebookdown']"
2,when facebookdown facebookoutage instagramblackout instagramdown listening to uptight everything s alright by stevie wonder on,2019-03-14 16:26:46,Postfity,"['facebookdown', 'instagramdown']"
3,facebookdown and instagramdown make me contemplate life so much teenagers nowadays spend more than hours each day on social network while they can spend that time educating themselves about crypto blockchain and what project can generate like bcnex bigbom zerobank,2019-03-14 16:25:41,NikeMC,"['FacebookDown', 'facebookoutage', 'InstagramBlackout2019', 'instagramdown']"
4,facebook instagram and whatsapp hit by major outage facebookdown instagramdown whatsappdown worldnews,2019-03-14 16:24:50,ftchkn,"['FacebookDown', 'instagramdown', 'crypto', 'blockchain', 'BCNEX', 'Bigbom', 'Zerobank']"


In [231]:
clean_df.describe()

Unnamed: 0,tweet
count,434571
unique,1255
top,facebookdown and instagramdown so listen to my motivation video
freq,1732


In [236]:
np.sum(clean_df.isnull().any(axis=1))

0

In [241]:
clean_df.isnull().any(axis=0)

tweet    False
dtype: bool

In [238]:
tweets.iloc[clean_df[clean_df.isnull().any(axis=1)].index,:].head()

Unnamed: 0,timestamp,tweet_text,username,all_hashtags


In [239]:
clean_df.dropna(inplace=True)
clean_df.reset_index(drop=True,inplace=True)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434571 entries, 0 to 434570
Data columns (total 1 columns):
tweet    434571 non-null object
dtypes: object(1)
memory usage: 3.3+ MB


In [240]:
clean_df.head()

Unnamed: 0,tweet
0,true true instagram instagramdown facebookdown
1,how to prepare yourself for social media outage how can you make sure your social media strategy is sustainable check out our guest post on getanewsletter s blog find out how to use social media to collect email subscribers facebookdown instagramdown
2,when facebookdown facebookoutage instagramblackout instagramdown listening to uptight everything s alright by stevie wonder on
3,facebookdown and instagramdown make me contemplate life so much teenagers nowadays spend more than hours each day on social network while they can spend that time educating themselves about crypto blockchain and what project can generate like bcnex bigbom zerobank
4,facebook instagram and whatsapp hit by major outage facebookdown instagramdown whatsappdown worldnews


### Different approach to try solving that shifting that occured

In [None]:
%%time
print("Cleaning and parsing the tweets...\n")
ct = pd.DataFrame(columns=cols)
for i in range(nums[0],nums[1]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    ct.append({'tweet_text': tweet_cleaner(tweets['tweet_text'].iloc[i]), 'timestamp': tweets['timestamp'],
                                       'username': tweets['username'],'all_hashtags': tweets['all_hashtags']}, ignore_index=True)

Cleaning and parsing the tweets...

Tweets 10000 of 223372 has been processed
Tweets 20000 of 223372 has been processed
Tweets 30000 of 223372 has been processed
