# Libraries

In [54]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype

from collections import defaultdict
from scipy.stats.stats import pearsonr

# Data Understanding
## Data information
Displaying the dataset info before doing any operations on the data

In [55]:
# Read the raw tweets, using the first CSV column as the DataFrame index
# (comma is already the default separator of read_csv).
df_tweets = pd.read_csv('./dataset/tweets.csv', index_col=0)

# Summarise columns, non-null counts and dtypes before any cleaning.
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 509354017856950272 to 2312918930458324
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         971 non-null    object
 1   retweet_count   959 non-null    object
 2   reply_count     949 non-null    object
 3   favorite_count  951 non-null    object
 4   num_hashtags    913 non-null    object
 5   num_urls        947 non-null    object
 6   num_mentions    937 non-null    object
 7   created_at      1000 non-null   object
 8   text            961 non-null    object
dtypes: object(9)
memory usage: 78.1+ KB


Dropping duplicate rows and displaying the result using `info()`

In [56]:
# drop_duplicates() returns a new DataFrame instead of modifying df_tweets,
# so the result has to be assigned back or the duplicates are never removed.
# Only the column values are compared; the index is not part of the check.
df_tweets = df_tweets.drop_duplicates()
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 509354017856950272 to 2312918930458324
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         971 non-null    object
 1   retweet_count   959 non-null    object
 2   reply_count     949 non-null    object
 3   favorite_count  951 non-null    object
 4   num_hashtags    913 non-null    object
 5   num_urls        947 non-null    object
 6   num_mentions    937 non-null    object
 7   created_at      1000 non-null   object
 8   text            961 non-null    object
dtypes: object(9)
memory usage: 78.1+ KB


In [57]:
# Cleaning 'user_id': keep only the rows whose user_id is an integer number.

# Missing or non-numeric ids are parsed to NaN and filtered out with a boolean
# mask. Dropping by index label instead would also remove every valid row that
# shares its label with a rejected one.
user_id_parsed = pd.to_numeric(df_tweets['user_id'], errors='coerce')
df_tweets = df_tweets.loc[user_id_parsed.notna()].copy()

# Every remaining value parses, so the column can be converted directly,
# without the deprecated errors='ignore' fallback.
df_tweets['user_id'] = pd.to_numeric(df_tweets['user_id'])

# To keep only the integers, test the values rather than their Python type:
# one fractional id makes the whole column float64, and a type check would
# then reject every row.
int_mask = df_tweets['user_id'] % 1 == 0
df_tweets = df_tweets.loc[int_mask].copy()

# If the column was parsed as float, the surviving values are whole numbers
# and can be stored as integers.
# NOTE(review): float64 cannot represent every integer above 2**53 exactly.
if pd.api.types.is_float_dtype(df_tweets['user_id']):
    df_tweets['user_id'] = df_tweets['user_id'].astype('int64')

df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 956 entries, 509354017856950272 to 587485987414155264
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         956 non-null    int64 
 1   retweet_count   925 non-null    object
 2   reply_count     921 non-null    object
 3   favorite_count  919 non-null    object
 4   num_hashtags    892 non-null    object
 5   num_urls        912 non-null    object
 6   num_mentions    910 non-null    object
 7   created_at      956 non-null    object
 8   text            927 non-null    object
dtypes: int64(1), object(8)
memory usage: 74.7+ KB


In [58]:
# Cleaning 'user_id': keep only the rows whose user_id is an integer number.
# NOTE(review): this cell repeats the 'user_id' cleaning of the preceding cell;
# on an already cleaned frame it changes nothing and could be removed.

# Missing or non-numeric ids are parsed to NaN and filtered out with a boolean
# mask. Dropping by index label instead would also remove every valid row that
# shares its label with a rejected one.
user_id_parsed = pd.to_numeric(df_tweets['user_id'], errors='coerce')
df_tweets = df_tweets.loc[user_id_parsed.notna()].copy()

# Every remaining value parses, so the column can be converted directly,
# without the deprecated errors='ignore' fallback.
df_tweets['user_id'] = pd.to_numeric(df_tweets['user_id'])

# To keep only the integers, test the values rather than their Python type:
# one fractional id makes the whole column float64, and a type check would
# then reject every row.
int_mask = df_tweets['user_id'] % 1 == 0
df_tweets = df_tweets.loc[int_mask].copy()

# If the column was parsed as float, the surviving values are whole numbers
# and can be stored as integers.
# NOTE(review): float64 cannot represent every integer above 2**53 exactly.
if pd.api.types.is_float_dtype(df_tweets['user_id']):
    df_tweets['user_id'] = df_tweets['user_id'].astype('int64')

df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 956 entries, 509354017856950272 to 587485987414155264
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         956 non-null    int64 
 1   retweet_count   925 non-null    object
 2   reply_count     921 non-null    object
 3   favorite_count  919 non-null    object
 4   num_hashtags    892 non-null    object
 5   num_urls        912 non-null    object
 6   num_mentions    910 non-null    object
 7   created_at      956 non-null    object
 8   text            927 non-null    object
dtypes: int64(1), object(8)
memory usage: 74.7+ KB


In [59]:
# Cleaning 'created_at' (the user's entropy can later be derived from this column)

# Parse the timestamps; values that cannot be parsed become NaT instead of raising.
created_at_parsed = pd.to_datetime(df_tweets['created_at'], errors='coerce')
df_tweets['created_at'] = created_at_parsed

df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 956 entries, 509354017856950272 to 587485987414155264
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   user_id         956 non-null    int64         
 1   retweet_count   925 non-null    object        
 2   reply_count     921 non-null    object        
 3   favorite_count  919 non-null    object        
 4   num_hashtags    892 non-null    object        
 5   num_urls        912 non-null    object        
 6   num_mentions    910 non-null    object        
 7   created_at      956 non-null    datetime64[ns]
 8   text            927 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 74.7+ KB


In [61]:
# Cleaning the count columns: for each one, remove the rows whose value is
# missing or not numeric, then convert the column to a numeric dtype.
COUNT_COLUMNS = [
    'retweet_count',
    'reply_count',
    'favorite_count',
    'num_hashtags',
    'num_urls',
    'num_mentions',
]

for column in COUNT_COLUMNS:
    # Unparseable or missing values become NaN and are filtered out with a
    # boolean mask. Dropping by index label instead would also remove valid
    # rows that share a label with a rejected one.
    parsed = pd.to_numeric(df_tweets[column], errors='coerce')
    df_tweets = df_tweets.loc[parsed.notna()].copy()

    # Re-parse the surviving values so that pandas can pick an integer dtype
    # when possible. No errors='ignore' is needed (it is deprecated): every
    # remaining value is known to parse.
    df_tweets[column] = pd.to_numeric(df_tweets[column])

# The original last line was the invalid expression `df_tweets[]`; the recorded
# output is an info() summary, so that call is restored here.
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 845 entries, 509354017856950272 to 587485987414155264
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   user_id         845 non-null    int64         
 1   retweet_count   845 non-null    int64         
 2   reply_count     845 non-null    float64       
 3   favorite_count  845 non-null    float64       
 4   num_hashtags    845 non-null    float64       
 5   num_urls        845 non-null    float64       
 6   num_mentions    845 non-null    int64         
 7   created_at      845 non-null    datetime64[ns]
 8   text            844 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(1)
memory usage: 98.3+ KB
