# Analyzing tweet content

# Import

In [32]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np
import itertools
from collections import Counter
import warnings
import re
from IPython.display import clear_output
from IPython.core.display import display, HTML

In [7]:
warnings.simplefilter('ignore')

# Data

In [21]:
AllTweets = pd.read_csv("Data/AllTweets.csv", index_col=0)

In [22]:
AllTweets.head()

Unnamed: 0,index,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,username
0,0,2020-03-02 00:59,1,,,1.2342670258093302e+18,,https://twitter.com/kevindienda/status/1234267...,0,sommige van jullie willen interessant doen met...,
1,1,2020-03-02 00:57,586,,,1.2342664682898268e+18,@,https://twitter.com/lewinskylou2/status/123426...,252,Volgens minister Bruno Bruins (Medische Zorg) ...,
2,2,2020-03-02 00:56,0,,,1.2342662720635412e+18,,https://twitter.com/Watskeburtinmi1/status/123...,0,Carnavalsstichting Tilburg schrikt van twee ni...,
3,3,2020-03-02 00:51,0,,# # # #,1.234265145389527e+18,@,https://twitter.com/Gabber07/status/1234265145...,1,LIVE - Patiënte in Beatrixziekenhuis Gorinchem...,
4,4,2020-03-02 00:51,0,,,1.2342650855996006e+18,@,https://twitter.com/stellardoor24/status/12342...,0,Coronavirus Cold Open - SNL https://youtu.be/H...,


In [23]:
AllTweets.permalink.loc[4]

'https://twitter.com/stellardoor24/status/1234265085599600641'

# Analysing news tweets

Removing news tweets to just showing people interacting instead of sharing news article

In [33]:
def RemoveNewsTweets(DataFrame):
    """
    Takes dataframe with tweets and returns the same dataframe without all the news tweets (removes every tweet with a link)
    """
    pattern = 'https:\/\/|http:\/\/'
    length = len(DataFrame)
    for n in range(length):
        clear_output(wait=True)
        result = re.search(pattern, AllTweets.text.loc[n])
        if result:
            DataFrame = DataFrame.drop(n)
        else:
            continue
        print("Current progress: "+ str(np.round(n/length*100,2)),"%")
    return DataFrame

In [34]:
%time AllnonNewsTweets = RemoveNewsTweets(AllTweets)

Current progress: 100.0 %
Wall time: 1h 20min 5s


In [35]:
AllnonNewsTweets

Unnamed: 0,index,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,username
0,0,2020-03-02 00:59,1,,,1.2342670258093302e+18,,https://twitter.com/kevindienda/status/1234267...,0,sommige van jullie willen interessant doen met...,
6,6,2020-03-02 00:49,0,,,1.2342644252275302e+18,@,https://twitter.com/Aislin37081715/status/1234...,0,De likes vind ik net zo walgelijk als de tweet...,
8,8,2020-03-02 00:46,0,,,1.234263841913086e+18,,https://twitter.com/ElPee39284996/status/12342...,0,De inwoners van Lesbos zien de migranten als e...,
9,9,2020-03-02 00:45,0,,# #,1.2342635473345495e+18,,https://twitter.com/MariaVANL/status/123426354...,0,# marcvanranst doet die trui niet meer uit tot...,
10,10,2020-03-02 00:45,0,,,1.2342635292990874e+18,,https://twitter.com/APooh2310/status/123426352...,0,1. Afgezien daarvan; de EU (en aangesloten lan...,
...,...,...,...,...,...,...,...,...,...,...,...
165252,3563,2020-02-29 01:12,20,,# #,1.2335455419733975e+18,,https://twitter.com/piep_kuiken/status/1233545...,6,Journalist tegen bejaarde mevrouw: ‘Is # coron...,
165254,3565,2020-02-29 01:05,0,,#,1.2335437430154732e+18,,https://twitter.com/LeavingHolland/status/1233...,0,Stel je wordt nu griepig. T is er de tijd van ...,
165256,3567,2020-02-29 01:02,0,,# #,1.2335431368839373e+18,,https://twitter.com/gewoonmens/status/12335431...,1,Morgen maar even bellen met de ggd. Die lui di...,
165257,3568,2020-02-29 01:02,1,,,1.233543045389353e+18,,https://twitter.com/LavendelSnuiver/status/123...,0,De mutatie van de mens als gevolg van het coro...,


#### Save as .csv

In [36]:
AllnonNewsTweets.to_csv("Data/AllnonLinkTweets.csv")

### Get all Urls in twitter links

In [37]:
def GetAllLinksIntweets(DataFrame):
    """
    Takes dataframe with tweets and returns the same dataframe without all the news tweets (removes every tweet with a link)
    """
    pattern = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    length = len(DataFrame)
    UrlList = []
    for n in range(length):
        clear_output(wait=True)
        result = re.search(pattern, AllTweets.text.loc[n])
        if result:
            UrlList.append(result.group())
        else:
            continue
        print("Current progress: "+ str(np.round(n/length*100,2)),"%")
    return UrlList

In [38]:
%time AllUrlsInTweets = GetAllLinksIntweets(AllTweets)

Current progress: 99.73 %


In [40]:
Counter(AllUrlsInTweets).most_common(10)

[('https://youtu.be/H4qvO0StKto', 242),
 ('https://www.rtlnieuws.nl/nieuws/nederland/artikel/5056536/sehraz-breda-coronavirus-16-intensive-care-besmet-covid19',
  224),
 ('http://AD.nl', 214),
 ('https://youtu.be/XezLiezWN0E', 185),
 ('http://NU.nl', 167),
 ('https://nos.nl/l/2327194', 151),
 ('https://www.rtlnieuws.nl/nieuws/nederland/artikel/5056941/ggd-chef-al-6000-mensen-besmet-met-coronaviurs',
  144),
 ('http://cutt.ly/8e5346v', 137),
 ('https://www.rivm.nl/nieuws/actuele-informatie-over-coronavirus', 118),
 ('http://Rijksoverheid.nl', 109)]

### Get all domain an their appearance

In [41]:
PreFixes = []
pattern = 'http(?:s)?:\/\/(?:[\w-]+\.)*([\w-]{1,63})(?:\.(?:\w{3}|\w{2}))(?:$|\/)'
for x in AllUrlsInTweets:
    result = re.search(pattern, x)
    if result:
        PreFixes.append(result.group())

In [42]:
Counter(PreFixes).most_common(1000)

[('https://twitter.com/', 13176),
 ('http://dlvr.it/', 4826),
 ('http://bit.ly/', 4129),
 ('https://www.telegraaf.nl/', 3643),
 ('https://www.ad.nl/', 2207),
 ('https://www.nu.nl/', 2113),
 ('https://nos.nl/', 2027),
 ('https://nieuwsblik.nl/', 1940),
 ('https://youtu.be/', 1879),
 ('https://buff.ly/', 1565),
 ('https://ift.tt/', 1544),
 ('https://watskeburtinmijnstraat.nl/', 1375),
 ('https://www.hln.be/', 1332),
 ('https://www.rtlnieuws.nl/', 1257),
 ('https://bit.ly/', 952),
 ('http://zpr.io/', 892),
 ('https://www.instagram.com/', 765),
 ('https://goo.gl/', 754),
 ('https://www.volkskrant.nl/', 743),
 ('https://vrtnws.be/', 644),
 ('http://a.msn.com/', 635),
 ('https://www.nrc.nl/', 551),
 ('https://www.parool.nl/', 531),
 ('http://twib.in/', 495),
 ('https://www.nos.nl/', 494),
 ('https://brabantn.ws/', 476),
 ('http://enz.nl/', 467),
 ('http://ow.ly/', 458),
 ('https://lnkd.in/', 422),
 ('https://www.gelderlander.nl/', 379),
 ('https://www.youtube.com/', 356),
 ('https://www.rivm

###