# FIFA World Cup Qatar 2022 Twitter Analysis

In this analysis we seek to provide a visualization of the tweets published under the hashtag #WorldCupQatar2022 in its first week since its inauguration. 
<br>
<br>
We will be able to see some basic statistics of the tweets and answer questions about where, who and in what language. We will also be able to see a word cloud with the most representative words of the World Cup tweets.
We wish the best team to win!

In [4]:
# Import basic libraries
import pandas as pd
import numpy as np
import altair as alt

In [2]:
#!pip3 install snscrape

In [3]:
#!pip install altair vega_datasets

In [4]:
#!pip install snscrape

In [5]:
# librerías
import snscrape.modules.twitter as sntwitter

In [None]:
%%time

# Parámetros
tweets_list_mundial3 = []
maxTweets_mundial3 = 250_000
date_initial = "2022-11-20"

# Get tweets
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#WorldcupQatar2022 since:'+date_initial).get_items()): # se puede añadir esto --> since:'+date_initial
        if i>maxTweets_mundial3-1:
            break
        tweets_list_mundial3.append([tweet.user.username, tweet.date, tweet.id, tweet.content, tweet.url, tweet.lang,
                    tweet.hashtags, tweet.likeCount, tweet.replyCount, tweet.retweetCount, tweet.quoteCount, 
                    tweet.mentionedUsers, tweet.coordinates, tweet.place, tweet.cashtags, tweet.media])
        # Otra info que puede sernos útil: tweet.media,  tweet.url

In [None]:
# Pandas dataframe con tweets de los tres presidentes relacionados con el worldcup
column_names = ("username","date","id","content","url","language","hashtags",
                "likes_count","reply_count","retweet_count","quote_count","mentioned_users","coordinates","place","cashtags", "media")
df = pd.DataFrame(tweets_list_mundial3, columns=column_names)
df

In [6]:
#df.to_csv("data/FIFA World Cup Qatar 2022 Twitter_v2.csv")

In [7]:
df = pd.read_csv("../data/FIFA World Cup Qatar 2022 Twitter Raw.csv")

In [8]:
df.dtypes

Unnamed: 0          int64
username           object
date               object
id                  int64
content            object
url                object
language           object
hashtags           object
likes_count         int64
reply_count         int64
retweet_count       int64
quote_count         int64
mentioned_users    object
coordinates        object
place              object
cashtags           object
media              object
dtype: object

In [9]:
df.isnull().sum()

Unnamed: 0              0
username                4
date                    0
id                      0
content                 0
url                     0
language                0
hashtags               11
likes_count             0
reply_count             0
retweet_count           0
quote_count             0
mentioned_users    161225
coordinates        173017
place              173018
cashtags           183841
media              122800
dtype: int64

## In which language are most of the world cup tweets made?

In [10]:
# Número de comentarios por idioma
print("\nTweets by language, from November 20, 2022 to November 26, 2022:\n")
df_language=pd.DataFrame(df['language'].value_counts()).reset_index().rename(columns={'index':'language', 'language':'tweets'})


df_language.head()


Tweets by language, from November 20, 2022 to November 26, 2022:



Unnamed: 0,language,tweets
0,en,87445
1,pt,22631
2,es,16353
3,fr,8836
4,ja,7613


In [11]:
#interpret language code
tables = pd.read_csv("https://datahub.io/core/language-codes/r/language-codes.csv")
tables.rename(columns = {'alpha2':'language'}, inplace = True)
tables.rename(columns = {'English':'language_comp'}, inplace = True)
tables.head()

Unnamed: 0,language,language_comp
0,aa,Afar
1,ab,Abkhazian
2,ae,Avestan
3,af,Afrikaans
4,ak,Akan


In [12]:
tables.shape

(184, 2)

In [13]:
n = len(pd.unique(tables['language']))
print(n)

184


In [14]:
#interpret language code

df_language= pd.merge(df_language, tables, on="language", how="inner")
df_language.head()
#pd.merge(df_language, tables[["language", "language_comp"]],  how="left", on="language")

Unnamed: 0,language,tweets,language_comp
0,en,87445,English
1,pt,22631,Portuguese
2,es,16353,Spanish; Castilian
3,fr,8836,French
4,ja,7613,Japanese


In [15]:
# language, general plot
alt.Chart(df_language).mark_bar().encode(
    alt.X('language_comp',  sort='-y',  title="Language"),
    alt.Y('tweets', title="Tweets"),
    alt.Color('language_comp', legend=None),
    tooltip=['language_comp', 'tweets']
).properties(title="Tweets by language")

In [16]:
nr = 15
top = df_language.sort_values(by='tweets', ascending=False).head(nr)


alt.Chart(
    top,
    title='Top {} languages in tweets about the 2022 World Cup'.format(nr)
).mark_bar().encode(
    x=alt.X('language_comp:N', sort=alt.EncodingSortField(field="tweets", op="count", order='descending'), title="Language"),
    y=alt.Y('tweets', title="Tweets"),
)


In [17]:
top

Unnamed: 0,language,tweets,language_comp
0,en,87445,English
1,pt,22631,Portuguese
2,es,16353,Spanish; Castilian
3,fr,8836,French
4,ja,7613,Japanese
5,ar,7191,Arabic
6,tr,3666,Turkish
7,pl,2775,Polish
8,it,1849,Italian
9,nl,1071,Dutch; Flemish


## Which days have had the most tweets about the world cup?

In [18]:
df['date_only'] = pd.to_datetime(df['date'])

In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only
0,0,0KBaila_Q45,2022-11-27 04:16:55+00:00,1596719695595732993,Apakah Argentina bisa melaju ke final dan MESS...,https://twitter.com/0KBaila_Q45/status/1596719...,in,['WorldcupQatar2022'],0,0,0,0,,,,,,2022-11-27 04:16:55+00:00
1,1,SoheBoss,2022-11-27 04:16:51+00:00,1596719678050607104,@NeblioTeam @TerraRebels #WorldcupQatar2022 #B...,https://twitter.com/SoheBoss/status/1596719678...,qme,"['WorldcupQatar2022', 'Bitcoin']",0,0,0,0,"[User(username='NeblioTeam', id=87829160683054...",,,,,2022-11-27 04:16:51+00:00
2,2,AdrielVilla77,2022-11-27 04:16:48+00:00,1596719666172289024,Toda copa a musa volta com a melhor música ❤️☀...,https://twitter.com/AdrielVilla77/status/15967...,pt,"['Wakawaka', 'shakira', 'CopadoMundo', 'Worldc...",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:48+00:00
3,3,theUG,2022-11-27 04:16:40+00:00,1596719631317729282,MARADONA FLYING KNEE \n#FIFAWorldCup #Worldcup...,https://twitter.com/theUG/status/1596719631317...,en,"['FIFAWorldCup', 'WorldcupQatar2022']",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:40+00:00
4,4,DeccanChronicle,2022-11-27 04:16:36+00:00,1596719612984692736,With the @FIFAcom #WorldcupQatar2022 underway ...,https://twitter.com/DeccanChronicle/status/159...,en,"['WorldcupQatar2022', 'Inoxcinemas']",0,0,0,0,"[User(username='FIFAcom', id=140070953, displa...",,,,,2022-11-27 04:16:36+00:00


In [20]:
df["date_only"] = pd.to_datetime(df["date_only"])

In [21]:
df_date = df.groupby(df['date_only'].dt.date).size().reset_index(name='Count')
df_date.head()

Unnamed: 0,date_only,Count
0,2022-11-20,7947
1,2022-11-21,6266
2,2022-11-22,19127
3,2022-11-23,33918
4,2022-11-24,52321


In [22]:
df_date["date_only"] = pd.to_datetime(df_date["date_only"])

In [23]:
alt.Chart(df_date).mark_bar(size=25).encode(
    alt.X('date_only', title="Date"),
    alt.Y('Count:Q', title="Tweets"),
    alt.Color('date_only'),
    tooltip=['date_only', 'Count']
).properties(title="Number of tweets per day")

## Where are the people tweeting about the world cup?

In [24]:
df_map=df.loc[df['coordinates'].notnull()]

In [25]:
df_map.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only
11,11,Arv_Ind_Chauhan,2022-11-27 04:14:24+00:00,1596719061265567744,That lucky 🐐 made it to feature page of @ttind...,https://twitter.com/Arv_Ind_Chauhan/status/159...,en,"['GOAT𓃵', 'WorldcupQatar2022']",1,0,0,0,"[User(username='ttindia', id=141080858, displa...","Coordinates(longitude=80.658613, latitude=26.6...","Place(fullName='Lucknow, India', name='Lucknow...",,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 04:14:24+00:00
14,14,Ger_mannn,2022-11-27 04:13:36+00:00,1596718858957914112,You can spit in a young girls face you can red...,https://twitter.com/Ger_mannn/status/159671885...,en,['WorldcupQatar2022'],0,0,0,0,"[User(username='Carra23', id=2585396407, displ...","Coordinates(longitude=-6.6960971, latitude=53....","Place(fullName='Louth, Ireland', name='Louth',...",,,2022-11-27 04:13:36+00:00
91,91,GustavoEscobarG,2022-11-27 04:01:11+00:00,1596715734188466176,Tremendo baile en el #EcuadorvsNetherlands va...,https://twitter.com/GustavoEscobarG/status/159...,es,"['EcuadorvsNetherlands', 'ecu', 'WorldcupQatar...",0,0,0,0,"[User(username='fifaworldcup_es', id=142296675...","Coordinates(longitude=-80.4676050033072, latit...","Place(fullName='Guayaquil, Ecuador', name='Gua...",,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:01:11+00:00
132,132,macec88,2022-11-27 03:53:05+00:00,1596713694989537280,Just a couple of mate from High School still h...,https://twitter.com/macec88/status/15967136949...,en,['WorldCupQatar2022'],5,0,0,0,"[User(username='FedSquare', id=34868390, displ...","Coordinates(longitude=144.96796240787086, lati...","Place(fullName='Federation Square', name='Fede...",,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 03:53:05+00:00
143,143,preethisadasiv,2022-11-27 03:51:31+00:00,1596713303186677760,What do you call that?!!😇\n#Messi𓃵\n#Argentina...,https://twitter.com/preethisadasiv/status/1596...,en,"['Messi𓃵', 'ArgentinavsMexico', 'WorldcupQatar...",3,1,0,0,,"Coordinates(longitude=76.7996471, latitude=8.3...","Place(fullName='Trivandrum, India', name='Triv...",,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 03:51:31+00:00


In [26]:
print("There are {} tweets with coordinates of the place of publication".format(df_map.shape[0]))

There are 11223 tweets with coordinates of the place of publication


In [27]:
df_map.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only
11,11,Arv_Ind_Chauhan,2022-11-27 04:14:24+00:00,1596719061265567744,That lucky 🐐 made it to feature page of @ttind...,https://twitter.com/Arv_Ind_Chauhan/status/159...,en,"['GOAT𓃵', 'WorldcupQatar2022']",1,0,0,0,"[User(username='ttindia', id=141080858, displa...","Coordinates(longitude=80.658613, latitude=26.6...","Place(fullName='Lucknow, India', name='Lucknow...",,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 04:14:24+00:00
14,14,Ger_mannn,2022-11-27 04:13:36+00:00,1596718858957914112,You can spit in a young girls face you can red...,https://twitter.com/Ger_mannn/status/159671885...,en,['WorldcupQatar2022'],0,0,0,0,"[User(username='Carra23', id=2585396407, displ...","Coordinates(longitude=-6.6960971, latitude=53....","Place(fullName='Louth, Ireland', name='Louth',...",,,2022-11-27 04:13:36+00:00
91,91,GustavoEscobarG,2022-11-27 04:01:11+00:00,1596715734188466176,Tremendo baile en el #EcuadorvsNetherlands va...,https://twitter.com/GustavoEscobarG/status/159...,es,"['EcuadorvsNetherlands', 'ecu', 'WorldcupQatar...",0,0,0,0,"[User(username='fifaworldcup_es', id=142296675...","Coordinates(longitude=-80.4676050033072, latit...","Place(fullName='Guayaquil, Ecuador', name='Gua...",,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:01:11+00:00
132,132,macec88,2022-11-27 03:53:05+00:00,1596713694989537280,Just a couple of mate from High School still h...,https://twitter.com/macec88/status/15967136949...,en,['WorldCupQatar2022'],5,0,0,0,"[User(username='FedSquare', id=34868390, displ...","Coordinates(longitude=144.96796240787086, lati...","Place(fullName='Federation Square', name='Fede...",,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 03:53:05+00:00
143,143,preethisadasiv,2022-11-27 03:51:31+00:00,1596713303186677760,What do you call that?!!😇\n#Messi𓃵\n#Argentina...,https://twitter.com/preethisadasiv/status/1596...,en,"['Messi𓃵', 'ArgentinavsMexico', 'WorldcupQatar...",3,1,0,0,,"Coordinates(longitude=76.7996471, latitude=8.3...","Place(fullName='Trivandrum, India', name='Triv...",,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 03:51:31+00:00


In [28]:
coordinates = df[df['coordinates'].notnull()]['coordinates']
coordList = coordinates.tolist()

def GetLongitudeLatitude(coordList):
    splittedList = []
    for item in coordList:
        
        splitted = item.split('=')
        long = splitted[1].split(',')[0]
        lat = splitted[2][:-1]
        cords= [long,lat]
        splittedList.append(cords)
        
    return splittedList

myCords = GetLongitudeLatitude(coordList)
coordDataFrame = pd.DataFrame(myCords, columns=['longitude','latitude'])

coordDataFrame.head()
coordDataFrame.head()

Unnamed: 0,longitude,latitude
0,80.658613,26.642291
1,-6.6960971,53.6985389
2,-80.4676050033072,-3.06371600046572
3,144.96796240787086,-37.81808406931686
4,76.7996471,8.3860508


In [29]:
import folium
from folium.plugins import MarkerCluster
from folium import plugins
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap

locations = list(zip(coordDataFrame.latitude, coordDataFrame.longitude))

map1 = folium.Map(location=[0,0], zoom_start=2)
FastMarkerCluster(data=locations).add_to(map1)
map1

ModuleNotFoundError: No module named 'folium'

In [30]:
import folium
from folium.plugins import HeatMap
map_folium = folium.Map([0,0],zoom_start=2)
HeatMap(coordDataFrame[['latitude','longitude']].dropna(),radius=9,gradient={0.2:'blue',0.4:'purple',0.6:'orange',1.0:'red'}).add_to(map_folium)
display(map_folium)

ModuleNotFoundError: No module named 'folium'

In [96]:
coordDataFrame['address'] = coordDataFrame.apply(
    lambda row: geolocator.reverse((row['latitude'], row['longitude'])), axis=1)

In [99]:
coordDataFrame.head()

Unnamed: 0,longitude,latitude,geometry,address
0,80.658613,26.642291,POINT (80.65861 26.64229),"(Kusumbhi, Kanpur-Lucknow Road, Nawab Ganj, Ha..."
1,-6.6960971,53.6985389,POINT (-6.69610 53.69854),"(R163, Donaghpatrick ED, The Municipal Distric..."
2,-80.4676050033072,-3.06371600046572,POINT (-80.46761 -3.06372),"(Ecuador, (-1.3397668, -79.3666965))"
3,144.96796240787086,-37.81808406931686,POINT (144.96796 -37.81808),"(Federation Square, Swanston Street, Melbourne..."
4,76.7996471,8.3860508,POINT (76.79965 8.38605),"(Kanyakumari, Kanyakumari District, Tamil Nadu..."


In [121]:
import geopandas as gpd
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

In [122]:
# Convert Long Lat into numeric type
coordDataFrame['longitude'] = pd.to_numeric(coordDataFrame['longitude'])
coordDataFrame['latitude'] = pd.to_numeric(coordDataFrame['latitude'])
 
# Convert Long Lat into Point Geometry
coordDataFrame = gpd.GeoDataFrame(coordDataFrame, geometry = gpd.points_from_xy(x=coordDataFrame['longitude'], y=coordDataFrame['latitude']))
 
# Set CRS
coordDataFrame = coordDataFrame.set_crs('EPSG:4326')

In [123]:
world.head()

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
0,889953.0,Oceania,Fiji,FJI,5496,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,58005463.0,Africa,Tanzania,TZA,63177,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,603253.0,Africa,W. Sahara,ESH,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,37589262.0,North America,Canada,CAN,1736425,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,328239523.0,North America,United States of America,USA,21433226,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [124]:
world=world.join(
    gpd.sjoin(coordDataFrame, world).groupby("index_right").size().rename("points"),
    how="left",
)

In [126]:
world['points'] = world['points'].fillna(0)
world['points'] = world['points'].astype(int)
world.head()

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry,points
0,889953.0,Oceania,Fiji,FJI,5496,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",0
1,58005463.0,Africa,Tanzania,TZA,63177,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",45
2,603253.0,Africa,W. Sahara,ESH,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",0
3,37589262.0,North America,Canada,CAN,1736425,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",202
4,328239523.0,North America,United States of America,USA,21433226,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",1635


In [135]:
print("\nCountries with the most likes on their tweets, from November 20, 2022 to November 26, 2022:\n")
df_country=world.pivot_table(values='points', index = 'name', aggfunc= 'sum')
df_country=df_country.sort_values(by=['points'], ascending=False)
df_country.head(15)


Countries with the most likes on their tweets, from November 20, 2022 to November 26, 2022:



Unnamed: 0_level_0,points
name,Unnamed: 1_level_1
Brazil,1948
United States of America,1635
United Kingdom,1251
Qatar,484
Saudi Arabia,282
India,249
Mexico,219
Indonesia,209
Japan,207
Canada,202


## Which users have the most likes?

In [274]:
print("\nUsers with the most likes on their tweets, from November 20, 2022 to November 26, 2022:\n")
df_user=df.pivot_table(values='likes_count', index = 'username', aggfunc= 'sum')
df_user=df_user.sort_values(by=['likes_count'], ascending=False)
df_user.head(15)


Users with the most likes on their tweets, from November 20, 2022 to November 26, 2022:



Unnamed: 0_level_0,likes_count
username,Unnamed: 1_level_1
Carra23,131502
beINSPORTS,84088
Saudi_Gazette,35543
anandmahindra,35161
CBF_Futebol,28174
BlazquezFont,23761
GIMS,23111
SobugMessi10jr,20330
1209msg,17223
sniper_ma,10449


# #HumanRights

As we know, one of the biggest controversies of the Qatar 2022 World Cup is the breach of human rights by the host country. We will see how many tweets refer to this aspect.

In [186]:
df_human=df.loc[df.hashtags.str.contains("HumanRights", na=False)]

In [187]:
df_human.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only,tweet_clean
41,41,markgstachowski,2022-11-27 04:08:58+00:00,1596717694673469440,@POTUS @SenateGOP @HouseGOP @dscc @HouseDemocr...,https://twitter.com/markgstachowski/status/159...,en,"['Qatar', 'HumanRightsViolations', 'WorldcupQa...",0,0,0,0,"[User(username='POTUS', id=1349149096909668363...",,,,,2022-11-27 04:08:58+00:00,"[@potus, @senategop, @housegop, @dscc, @housed..."
734,734,KevinSojoodi,2022-11-27 02:01:49+00:00,1596685693954461696,#WorldcupQatar2022 is just a small example of ...,https://twitter.com/KevinSojoodi/status/159668...,en,"['WorldcupQatar2022', 'HumanRights', 'islamic'...",2,2,1,1,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 02:01:49+00:00,"[#worldcupqatar2022, is, just, a, small, examp..."
1115,1115,therapist_gay,2022-11-27 00:36:21+00:00,1596664185852547072,LGBTQ folx and allies keep finding clever ways...,https://twitter.com/therapist_gay/status/15966...,en,"['LGBT', 'LGBTQ', 'LGBTQIA', 'Qatar', 'Rainbow...",0,0,0,0,,,,,,2022-11-27 00:36:21+00:00,"[lgbtq, folx, and, allies, keep, finding, clev..."
1438,1438,U_N_I_T_E_N_O_W,2022-11-27 00:00:21+00:00,1596655126227615745,The REAL Reason Europe Took Over the World htt...,https://twitter.com/U_N_I_T_E_N_O_W/status/159...,en,"['WorldcupQatar2022', 'Baudet', 'HumanRights',...",0,0,0,0,"[User(username='YouTube', id=10228272, display...",,,,,2022-11-27 00:00:21+00:00,"[the, real, reason, europe, took, over, the, w..."
1759,1759,dr_byron24,2022-11-26 23:27:37+00:00,1596646889159467008,BREAKING: the Spanish National Soccer Team at ...,https://twitter.com/dr_byron24/status/15966468...,en,"['WorldcupQatar2022', 'Interpol', 'UN', 'Gobie...",0,0,0,0,,,,,,2022-11-26 23:27:37+00:00,"[breaking, the, spanish, national, soccer, tea..."


In [188]:
print('There are {} tweets with hashtags #WorldCupQatar2022 and #HumanRights'.format(len(df_human)))

There are 215 tweets with hashtags #WorldCupQatar2022 and #HumanRights


In [190]:
pd.set_option('display.max_colwidth', None)
df_human.loc[:,"content"]

41            @POTUS @SenateGOP @HouseGOP @dscc @HouseDemocrats just as #Qatar is being globally criticized for its #HumanRightsViolations leading to and during #WorldcupQatar2022, the #USA will face criticism for its #GUNviolence in #WorldCup2026. Now is the time do something to avoid it.
734                                                                                 #WorldcupQatar2022 is just a small example of #HumanRights situation in #islamic countries! #TheWorld should rise with women of #Iran! #IranRevolution #WomanLifeFreedom #Antisemitism https://t.co/JcIKsQSBXj
1115      LGBTQ folx and allies keep finding clever ways to defy Qatar’s ban of rainbow flags at the World Cup https://t.co/lAnVe50Lcu #LGBT #LGBTQ #LGBTQIA #Qatar #RainbowFlag #WorldCup #WorldcupQatar2022 #Pride #GayRights #HumanRights #HumanRightsViolations #TamimbinHamadAlThaniSucksDick
1438                                                                                                                           

# Cleaning Data

## Remove punctuation marks

In [31]:
#library that contains punctuation
import string
signos_puntuacion = string.punctuation
signos_puntuacion = signos_puntuacion.replace("#", "")  # quitar '#' de hashtags 
signos_puntuacion = signos_puntuacion.replace("@", "")  # quitar '@' de cuentas
signos_puntuacion

'!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

In [32]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in signos_puntuacion])
    return punctuationfree

#storing the puntuation free text
df['tweet_clean'] = df['content'].apply(lambda x:remove_punctuation(x))
df.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only,tweet_clean
0,0,0KBaila_Q45,2022-11-27 04:16:55+00:00,1596719695595732993,Apakah Argentina bisa melaju ke final dan MESS...,https://twitter.com/0KBaila_Q45/status/1596719...,in,['WorldcupQatar2022'],0,0,0,0,,,,,,2022-11-27 04:16:55+00:00,Apakah Argentina bisa melaju ke final dan MESS...
1,1,SoheBoss,2022-11-27 04:16:51+00:00,1596719678050607104,@NeblioTeam @TerraRebels #WorldcupQatar2022 #B...,https://twitter.com/SoheBoss/status/1596719678...,qme,"['WorldcupQatar2022', 'Bitcoin']",0,0,0,0,"[User(username='NeblioTeam', id=87829160683054...",,,,,2022-11-27 04:16:51+00:00,@NeblioTeam @TerraRebels #WorldcupQatar2022 #B...
2,2,AdrielVilla77,2022-11-27 04:16:48+00:00,1596719666172289024,Toda copa a musa volta com a melhor música ❤️☀...,https://twitter.com/AdrielVilla77/status/15967...,pt,"['Wakawaka', 'shakira', 'CopadoMundo', 'Worldc...",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:48+00:00,Toda copa a musa volta com a melhor música ❤️☀...
3,3,theUG,2022-11-27 04:16:40+00:00,1596719631317729282,MARADONA FLYING KNEE \n#FIFAWorldCup #Worldcup...,https://twitter.com/theUG/status/1596719631317...,en,"['FIFAWorldCup', 'WorldcupQatar2022']",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:40+00:00,MARADONA FLYING KNEE \n#FIFAWorldCup #Worldcup...
4,4,DeccanChronicle,2022-11-27 04:16:36+00:00,1596719612984692736,With the @FIFAcom #WorldcupQatar2022 underway ...,https://twitter.com/DeccanChronicle/status/159...,en,"['WorldcupQatar2022', 'Inoxcinemas']",0,0,0,0,"[User(username='FIFAcom', id=140070953, displa...",,,,,2022-11-27 04:16:36+00:00,With the @FIFAcom #WorldcupQatar2022 underway ...


## Lowercase texts

Part of the normalization is to put all the texts in lowercase (or uppercase), this can be used later, for example, to do a word count.

⚠️ Note: When doing this there is a risk of losing contextual information; For example, within a tweet when someone writes a specific word in upper case and the rest in lower case, this leads to a greater importance or increase in tone of voice.

In [33]:
# cambiar los tweets a letras minúsculas
df['tweet_clean'] = df['tweet_clean'].apply((lambda x: x.lower()))
df.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only,tweet_clean
0,0,0KBaila_Q45,2022-11-27 04:16:55+00:00,1596719695595732993,Apakah Argentina bisa melaju ke final dan MESS...,https://twitter.com/0KBaila_Q45/status/1596719...,in,['WorldcupQatar2022'],0,0,0,0,,,,,,2022-11-27 04:16:55+00:00,apakah argentina bisa melaju ke final dan mess...
1,1,SoheBoss,2022-11-27 04:16:51+00:00,1596719678050607104,@NeblioTeam @TerraRebels #WorldcupQatar2022 #B...,https://twitter.com/SoheBoss/status/1596719678...,qme,"['WorldcupQatar2022', 'Bitcoin']",0,0,0,0,"[User(username='NeblioTeam', id=87829160683054...",,,,,2022-11-27 04:16:51+00:00,@neblioteam @terrarebels #worldcupqatar2022 #b...
2,2,AdrielVilla77,2022-11-27 04:16:48+00:00,1596719666172289024,Toda copa a musa volta com a melhor música ❤️☀...,https://twitter.com/AdrielVilla77/status/15967...,pt,"['Wakawaka', 'shakira', 'CopadoMundo', 'Worldc...",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:48+00:00,toda copa a musa volta com a melhor música ❤️☀...
3,3,theUG,2022-11-27 04:16:40+00:00,1596719631317729282,MARADONA FLYING KNEE \n#FIFAWorldCup #Worldcup...,https://twitter.com/theUG/status/1596719631317...,en,"['FIFAWorldCup', 'WorldcupQatar2022']",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:40+00:00,maradona flying knee \n#fifaworldcup #worldcup...
4,4,DeccanChronicle,2022-11-27 04:16:36+00:00,1596719612984692736,With the @FIFAcom #WorldcupQatar2022 underway ...,https://twitter.com/DeccanChronicle/status/159...,en,"['WorldcupQatar2022', 'Inoxcinemas']",0,0,0,0,"[User(username='FIFAcom', id=140070953, displa...",,,,,2022-11-27 04:16:36+00:00,with the @fifacom #worldcupqatar2022 underway ...


## Tokenization

In [34]:
# getting the tokens per tweet
df['tweet_clean'] = df['tweet_clean'].apply(lambda x: x.rsplit())
df.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only,tweet_clean
0,0,0KBaila_Q45,2022-11-27 04:16:55+00:00,1596719695595732993,Apakah Argentina bisa melaju ke final dan MESS...,https://twitter.com/0KBaila_Q45/status/1596719...,in,['WorldcupQatar2022'],0,0,0,0,,,,,,2022-11-27 04:16:55+00:00,"[apakah, argentina, bisa, melaju, ke, final, d..."
1,1,SoheBoss,2022-11-27 04:16:51+00:00,1596719678050607104,@NeblioTeam @TerraRebels #WorldcupQatar2022 #B...,https://twitter.com/SoheBoss/status/1596719678...,qme,"['WorldcupQatar2022', 'Bitcoin']",0,0,0,0,"[User(username='NeblioTeam', id=87829160683054...",,,,,2022-11-27 04:16:51+00:00,"[@neblioteam, @terrarebels, #worldcupqatar2022..."
2,2,AdrielVilla77,2022-11-27 04:16:48+00:00,1596719666172289024,Toda copa a musa volta com a melhor música ❤️☀...,https://twitter.com/AdrielVilla77/status/15967...,pt,"['Wakawaka', 'shakira', 'CopadoMundo', 'Worldc...",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:48+00:00,"[toda, copa, a, musa, volta, com, a, melhor, m..."
3,3,theUG,2022-11-27 04:16:40+00:00,1596719631317729282,MARADONA FLYING KNEE \n#FIFAWorldCup #Worldcup...,https://twitter.com/theUG/status/1596719631317...,en,"['FIFAWorldCup', 'WorldcupQatar2022']",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:40+00:00,"[maradona, flying, knee, #fifaworldcup, #world..."
4,4,DeccanChronicle,2022-11-27 04:16:36+00:00,1596719612984692736,With the @FIFAcom #WorldcupQatar2022 underway ...,https://twitter.com/DeccanChronicle/status/159...,en,"['WorldcupQatar2022', 'Inoxcinemas']",0,0,0,0,"[User(username='FIFAcom', id=140070953, displa...",,,,,2022-11-27 04:16:36+00:00,"[with, the, @fifacom, #worldcupqatar2022, unde..."


# WordClouds

In [None]:
#!pip install wordcloud

In [None]:
#!pip install nltk

In [35]:
df['tweet_clean'].head()

0    [apakah, argentina, bisa, melaju, ke, final, d...
1    [@neblioteam, @terrarebels, #worldcupqatar2022...
2    [toda, copa, a, musa, volta, com, a, melhor, m...
3    [maradona, flying, knee, #fifaworldcup, #world...
4    [with, the, @fifacom, #worldcupqatar2022, unde...
Name: tweet_clean, dtype: object

In [37]:
englishTwitts = df[df['language'] == 'en']
englishTwitts.head()

Unnamed: 0.1,Unnamed: 0,username,date,id,content,url,language,hashtags,likes_count,reply_count,retweet_count,quote_count,mentioned_users,coordinates,place,cashtags,media,date_only,tweet_clean
3,3,theUG,2022-11-27 04:16:40+00:00,1596719631317729282,MARADONA FLYING KNEE \n#FIFAWorldCup #Worldcup...,https://twitter.com/theUG/status/1596719631317...,en,"['FIFAWorldCup', 'WorldcupQatar2022']",0,0,0,0,,,,,[Video(thumbnailUrl='https://pbs.twimg.com/ext...,2022-11-27 04:16:40+00:00,"[maradona, flying, knee, #fifaworldcup, #world..."
4,4,DeccanChronicle,2022-11-27 04:16:36+00:00,1596719612984692736,With the @FIFAcom #WorldcupQatar2022 underway ...,https://twitter.com/DeccanChronicle/status/159...,en,"['WorldcupQatar2022', 'Inoxcinemas']",0,0,0,0,"[User(username='FIFAcom', id=140070953, displa...",,,,,2022-11-27 04:16:36+00:00,"[with, the, @fifacom, #worldcupqatar2022, unde..."
5,5,MosesKgopa,2022-11-27 04:16:25+00:00,1596719568357130240,Lea tseba mabhaku ka malebe mara....🤔🤔🤔?\n\nTh...,https://twitter.com/MosesKgopa/status/15967195...,en,"['hollywoodbets', 'betwaycodes', 'WorldcupQata...",0,0,0,0,,,,,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 04:16:25+00:00,"[lea, tseba, mabhaku, ka, malebe, mara🤔🤔🤔, tha..."
6,6,ahmed_baokbah,2022-11-27 04:16:23+00:00,1596719561826390017,I just realized USA and Iran are playing next ...,https://twitter.com/ahmed_baokbah/status/15967...,en,['WorldcupQatar2022'],0,0,0,0,,,,,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 04:16:23+00:00,"[i, just, realized, usa, and, iran, are, playi..."
9,9,victormoh,2022-11-27 04:15:44+00:00,1596719397464584192,"Lets not use @qatarairways anymore, Qatar pol...",https://twitter.com/victormoh/status/159671939...,en,"['Qatar', 'LGBT', 'IranRevoIution', 'WorldcupQ...",1,0,0,0,"[User(username='qatarairways', id=14589119, di...",,,,[Photo(previewUrl='https://pbs.twimg.com/media...,2022-11-27 04:15:44+00:00,"[lets, not, use, @qatarairways, anymore, qatar..."


## Remove emojis

In [38]:
import re
def remove_emojis(text):
    
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
                               "]+", flags=re.UNICODE)
                               
    output = emoji_pattern.sub(r'', text) # no emoji
   
    return output.split(" ")
 
tweetClean = englishTwitts['tweet_clean'].apply(lambda x:remove_emojis(' '.join(x)))

In [39]:
tweetClean[0:6]

3     [maradona, flying, knee, #fifaworldcup, #world...
4     [with, the, @fifacom, #worldcupqatar2022, unde...
5     [lea, tseba, mabhaku, ka, malebe, mara, that, ...
6     [i, just, realized, usa, and, iran, are, playi...
9     [lets, not, use, @qatarairways, anymore, qatar...
11    [that, lucky, , made, it, to, feature, page, o...
Name: tweet_clean, dtype: object

In [40]:
import nltk
nltk.download('stopwords')
stopwords_3 = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jorge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords_3]
    return output



tweetClean=  tweetClean.apply(lambda x:remove_stopwords(x))


In [42]:
tweetClean[0:3]

3    [maradona, flying, knee, #fifaworldcup, #world...
4    [@fifacom, #worldcupqatar2022, underway, qatar...
5    [lea, tseba, mabhaku, ka, malebe, mara, time, ...
Name: tweet_clean, dtype: object

In [43]:
list_text = df.tweet_clean.tolist()
flatlist=[element for sublist in tweetClean for element in sublist]

In [44]:
stringText = ' '.join(map(str,flatlist))

In [45]:
from PIL import Image
from os import path
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt

In [49]:
wc_mask = np.array(Image.open( "resources/world_cup.png").convert('RGB'))
wcloud = WordCloud(width=3000, height=1700, random_state=11, max_font_size=500, 
                   background_color='white', max_words=200, collocations=False, mode='RGBA',
                   mask=wc_mask).generate(stringText)

FileNotFoundError: [Errno 2] No such file or directory: 'resources/world_cup.png'