In [228]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [229]:
#  Adjust matplotlib's default plotting parameters
from matplotlib.pyplot import rcParams

rcParams.update({
    'figure.figsize': (12, 6),   # (width, height)
    "font.weight": "bold",
    "font.size": 10,
    "axes.labelweight": "bold",
})

In [230]:
# Load the tweets CSV into a DataFrame and display it
df = pd.read_csv('covid19_tweets.csv')
df

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179103,AJIMATI AbdulRahman O.,"Ilorin, Nigeria",Animal Scientist|| Muslim|| Real Madrid/Chelsea,2013-12-30 18:59:19,412,1609,1062,False,2020-08-29 19:44:21,Thanks @IamOhmai for nominating me for the @WH...,['WearAMask'],Twitter for Android,False
179104,Jason,Ontario,When your cat has more baking soda than Ninja ...,2011-12-21 04:41:30,150,182,7295,False,2020-08-29 19:44:16,2020! The year of insanity! Lol! #COVID19 http...,['COVID19'],Twitter for Android,False
179105,BEEHEMOTH ⏳,🇨🇦 Canada,⚒️ The Architects of Free Trade ⚒️ Really Did ...,2016-07-13 17:21:59,1623,2160,98000,False,2020-08-29 19:44:15,@CTVNews A powerful painting by Juan Lucena. I...,,Twitter Web App,False
179106,Gary DelPonte,New York City,"Global UX UI Visual Designer. StoryTeller, Mus...",2009-10-27 17:43:13,1338,1111,0,False,2020-08-29 19:44:14,"More than 1,200 students test positive for #CO...",['COVID19'],Twitter for iPhone,False


In [231]:
#  Count the null/missing values in each column of the dataset
df.isna().sum()

user_name               0
user_location       36771
user_description    10286
user_created            0
user_followers          0
user_friends            0
user_favourites         0
user_verified           0
date                    0
text                    0
hashtags            51334
source                 77
is_retweet              0
dtype: int64

In [232]:
#  Count the null/missing values in the hashtags column specifically
df['hashtags'].isna().sum()


51334

In [233]:
#  Number of rows the dataset will have once the rows with null hashtags are removed.
#  The null count is computed from the data (instead of a hard-coded literal) so the
#  result stays correct if the CSV changes; int() keeps the displayed value a plain integer.
len(df) - int(df['hashtags'].isna().sum())

127774

In [234]:
#  Remove the rows whose 'hashtags' value is null and show how many rows remain
df_filt = df.dropna(subset=['hashtags'])
len(df_filt)

127774

In [235]:
#  Inspect the data type of each column in the filtered dataset
df_filt.dtypes

user_name           object
user_location       object
user_description    object
user_created        object
user_followers       int64
user_friends         int64
user_favourites      int64
user_verified         bool
date                object
text                object
hashtags            object
source              object
is_retweet            bool
dtype: object

Para lo que nosotros vamos a analizar, que es si el tweet está relacionado con el **Covid19**, solo nos vamos a quedar con los campos que puedan tener información asociada, que en este caso son el **texto** y los **hashtags** (la **fecha** también se descarta en la siguiente celda).

In [236]:
# Columns to discard from the filtered DataFrame (everything except 'text' and 'hashtags')
columnas_drop = [
    'user_name', 'user_location', 'user_description', 'user_created',
    'user_followers', 'user_friends', 'user_favourites', 'user_verified',
    'date', 'source', 'is_retweet',
]

In [237]:
# Build a reduced DataFrame without the discarded columns and compare both shapes
df_filt2 = df_filt.drop(columnas_drop, axis='columns')
print("Tamaño del Dataframe filtrado: ", df_filt.shape)
print("Tamaño del DataFrame nuevo: ", df_filt2.shape)

Tamaño del Dataframe filtrado:  (127774, 13)
Tamaño del DataFrame nuevo:  (127774, 2)


In [238]:
# Tally how many columns of the reduced DataFrame have each dtype
df_filt2.dtypes.value_counts()

object    2
Name: count, dtype: int64

In [239]:
#  Top 10 most frequent values of the hashtags column.
#  Each value is the string form of a tweet's whole hashtag list, so the counts are
#  per combination (and per capitalization), not per individual hashtag.
df_filt2['hashtags'].value_counts().head(10)

hashtags
['COVID19']                                                               37792
['Covid19']                                                                4834
['covid19']                                                                3124
['coronavirus', 'CoronaVirusUpdate', 'COVID19', 'CoronavirusPandemic']      624
['coronavirus']                                                             550
['COVID19', 'coronavirus']                                                  519
['Coronavirus', 'COVID19']                                                  503
['coronavirus', 'COVID19']                                                  491
['CoronaVirusUpdates', 'COVID19']                                           319
['Coronavirus']                                                             262
Name: count, dtype: int64

In [240]:
#  Helper that flags strings (hashtag lists or tweet text) mentioning covid or coronavirus
def covid(x):
    """Return 1 if ``x`` mentions covid or coronavirus (case-insensitive), else 0.

    Parameters
    ----------
    x : str
        Text to inspect, e.g. the string form of a hashtag list or a tweet body.
        Non-string values (such as NaN for a missing entry) are treated as
        "no mention" instead of raising.

    Returns
    -------
    int
        1 when 'covid' or 'coronavirus' occurs anywhere in ``x``, otherwise 0.
    """
    if not isinstance(x, str):
        return 0
    lowered = x.lower()  # lowercase once instead of once per keyword
    # 'covid' also matches 'covid19', so no separate check for the latter is needed
    return int('covid' in lowered or 'coronavirus' in lowered)

In [241]:
#  Run covid() over every value of the hashtags column and store the 1/0 flags in a new column
df_filt2['hashtags_covid'] = df_filt2['hashtags'].map(covid)
df_filt2

Unnamed: 0,text,hashtags,hashtags_covid
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],1
3,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],1
4,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",1
5,#coronavirus #covid19 deaths continue to rise....,"['coronavirus', 'covid19']",1
6,How #COVID19 Will Change Work in General (and ...,"['COVID19', 'Recruiting']",1
...,...,...,...
179101,Wallkill school nurse adds COVID-19 monitoring...,"['nurses', 'COVID19', 'coronavirus', 'schools']",1
179102,"we have reached 25mil cases of #covid19, world...",['covid19'],1
179103,Thanks @IamOhmai for nominating me for the @WH...,['WearAMask'],0
179104,2020! The year of insanity! Lol! #COVID19 http...,['COVID19'],1


In [242]:
# Count how many tweets were flagged 1 versus 0 by covid() on their hashtags
df_filt2['hashtags_covid'].value_counts()

hashtags_covid
1    103373
0     24401
Name: count, dtype: int64

In [243]:
#  Discard the tweets flagged 0 in 'hashtags_covid', keeping only the rows flagged 1
df_filt3 = df_filt2.loc[df_filt2['hashtags_covid'].eq(1)]
df_filt3

Unnamed: 0,text,hashtags,hashtags_covid
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],1
3,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],1
4,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",1
5,#coronavirus #covid19 deaths continue to rise....,"['coronavirus', 'covid19']",1
6,How #COVID19 Will Change Work in General (and ...,"['COVID19', 'Recruiting']",1
...,...,...,...
179100,Wallkill school nurse adds COVID-19 monitoring...,"['nurses', 'COVID19', 'coronavirus', 'schools']",1
179101,Wallkill school nurse adds COVID-19 monitoring...,"['nurses', 'COVID19', 'coronavirus', 'schools']",1
179102,"we have reached 25mil cases of #covid19, world...",['covid19'],1
179104,2020! The year of insanity! Lol! #COVID19 http...,['COVID19'],1


In [244]:
#  Drop the 'text' and 'hashtags' columns, which are not used from this frame onwards
columnas_drop = ['text', 'hashtags']
df_filt4 = df_filt3.drop(columnas_drop, axis='columns')
df_filt4

Unnamed: 0,hashtags_covid
2,1
3,1
4,1
5,1
6,1
...,...
179100,1
179101,1
179102,1
179104,1


In [245]:
#  Apply covid() to the tweet text and store the 1/0 flags in a new 'text_covid' column.
#  The text is taken from df_filt3, which holds exactly the rows of df_filt4, so covid()
#  is not run on tweets that were already discarded and no index re-alignment against a
#  larger frame is needed. assign() returns a new frame, so re-running the cell is safe.
df_filt4 = df_filt4.assign(text_covid=df_filt3['text'].apply(covid))
df_filt4

Unnamed: 0,hashtags_covid,text_covid
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
...,...,...
179100,1,1
179101,1,1
179102,1,1
179104,1,1


In [246]:
# Tally how many columns of df_filt4 have each dtype
df_filt4.dtypes.value_counts()

int64    2
Name: count, dtype: int64