### 1 & 2. Collecting, Saving Data, and Basic Exploratory Data Analysis (EDA)

In [1]:
from os import listdir
from os.path import isfile, join

# directory that holds the input files; keep only entries that are regular files
data_path = 'data'
files = list(filter(lambda name: isfile(join(data_path, name)), listdir(data_path)))
files

['altcoin.csv',
 'APPL.csv',
 'bitcoin.csv',
 'coindesk.csv',
 'Cryptocurrency.csv',
 'Gold.csv',
 'GOOG.csv',
 'YHOO.csv']

In [2]:
import pandas as pd

# function to read multiple csv files from a path into a list of dataframes
def read_csvs_from_path(data_path : str):
    """Read every regular file directly inside ``data_path`` as a CSV.

    For each file a numbered message is printed and the last row of the
    loaded dataframe is displayed.

    Parameters
    ----------
    data_path : str
        Directory to scan (sub-directories are skipped, not descended into).

    Returns
    -------
    list
        One ``pandas.DataFrame`` per file, in the order ``listdir`` reported
        the files.
    """
    # only regular files; the order is whatever listdir returns
    file_names = [name for name in listdir(data_path) if isfile(join(data_path, name))]
    dfs = []

    for position, file_name in enumerate(file_names, start=1):
        # build the path with join(), like the isfile() check above, instead of
        # gluing the pieces together with a hard-coded "/"
        df = pd.read_csv(join(data_path, file_name))
        print(str(position) + ". Sample data extracted from: '" + file_name + "'")
        display(df.tail(1))
        dfs.append(df)

    return dfs

# load every file under data_path; the call also prints a numbered line and
# displays the last row of each file as it is read
dfs = read_csvs_from_path(data_path)

1. Sample data extracted from: 'altcoin.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
5000,2023-03-09 19:03:33+00:00,1633906384160034817,🔔 Public Company Accounting Oversight Board (P...,https://twitter.com/Altcoin_Alerts/status/1633...,https://twitter.com/Altcoin_Alerts


2. Sample data extracted from: 'APPL.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
3180,2023-03-05 00:03:31+00:00,1632169937350303744,Apple’s Approval Process Delays Uniswap’s Mobi...,https://twitter.com/ZhotCrypto/status/16321699...,https://twitter.com/ZhotCrypto


3. Sample data extracted from: 'bitcoin.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
5000,2023-03-10 22:03:06+00:00,1634313959904886786,@JulesXavier9 Obama didn’t take office until 2...,https://twitter.com/btc_liberates/status/16343...,https://twitter.com/btc_liberates


4. Sample data extracted from: 'coindesk.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
5000,2023-03-06 04:06:30+00:00,1632593471042142213,@milkyway16eth source coindesk - Arca’s Hotz ...,https://twitter.com/hashtronaut207/status/1632...,https://twitter.com/hashtronaut207


5. Sample data extracted from: 'Cryptocurrency.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
5000,2023-03-10 14:44:37+00:00,1634203611403022336,Silvergate and Cryptocurrency😣😣,https://twitter.com/tamoinam/status/1634203611...,https://twitter.com/tamoinam


6. Sample data extracted from: 'Gold.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
5000,2023-03-10 22:37:01+00:00,1634322493056905217,Check out Vintage Bracelet Gold Tone Pink Ston...,https://twitter.com/ShopThar/status/1634322493...,https://twitter.com/ShopThar


7. Sample data extracted from: 'GOOG.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
5000,2023-03-05 12:01:20+00:00,1632350581569490946,Get instant updates and free trials join here ...,https://twitter.com/Smith28301/status/16323505...,https://twitter.com/Smith28301


8. Sample data extracted from: 'YHOO.csv'


Unnamed: 0,Datetime,Tweet Id,Text,URL,User
3643,2023-03-05 00:00:48+00:00,1632169253079072768,@NguboAyimbathwa @ChristoThurston @ThuliMadons...,https://twitter.com/Bright_Afrika/status/16321...,https://twitter.com/Bright_Afrika


In [3]:
# function to verify a parameter type
def verify_parameter(param, param_name, type_):
    """Check that ``param`` is an instance of ``type_``.

    Returns ``None`` when the check passes; otherwise raises ``ValueError``
    whose message names the parameter (``param_name``) and the expected type.
    """
    if isinstance(param, type_):
        return
    message = "Parameter '" + param_name + "' must be a " + str(type_)
    raise ValueError(message)

In [4]:
# function to print a simple line of chars
def print_lines(line:str = '-', n:int = 40):
    """Print the string ``line`` repeated ``n`` times on a single output line.

    Both arguments are type-checked with ``verify_parameter`` first
    (``line`` must be a ``str``, ``n`` an ``int``).
    """
    # validate both arguments before anything is printed
    for value, label, expected in ((line, 'line', str), (n, 'n', int)):
        verify_parameter(value, label, expected)

    separator = line * n
    print(separator)

In [5]:
class EDA:
    """Helpers that print the shape and column names of dataframes."""

    def display_shape_and_colnames_df(self, df:pd.DataFrame, return_shape:bool = True):
        """Print row count, column count and column names of ``df``.

        Returns ``df.shape`` when ``return_shape`` is true, otherwise ``None``.
        Raises ``ValueError`` (via ``verify_parameter``) on wrong argument types.
        """
        verify_parameter(df, 'df', pd.DataFrame)
        verify_parameter(return_shape, 'return_shape', bool)

        n_rows, n_cols = df.shape
        print("Dataframe Rows:", n_rows)
        print("Dataframe Columns:", n_cols)
        print("Column names:", df.columns.to_list())

        if not return_shape:
            return None
        return df.shape

    def display_shape_and_colnames_dfs(self, dfs:list, names:list):
        """Print the shape summary of every dataframe, then the total row count.

        ``names[i]`` is printed as the label of ``dfs[i]``; both lists must have
        the same length (checked with an ``assert``).
        """
        verify_parameter(dfs, 'dfs', list)
        verify_parameter(names, 'names', list)
        assert len(names) == len(dfs), "'dfs' and 'names' must have same length"

        # one section per dataframe, remembering each row count for the total
        row_counts = []
        for name, frame in zip(names, dfs):
            print_lines()
            print("Dataframe from file:", name)
            frame_shape = self.display_shape_and_colnames_df(frame)
            row_counts.append(frame_shape[0])

        print_lines(line='=')
        print("TOTAL ROWS:", sum(row_counts))
        
eda = EDA()
# `files` is the directory listing built in the first cell; each name is paired
# with the dataframe at the same position in `dfs`.
# NOTE(review): `dfs` was produced by a separate listdir call inside
# read_csvs_from_path - this assumes both listings came back in the same order.
eda.display_shape_and_colnames_dfs(dfs, files)

----------------------------------------
Dataframe from file: altcoin.csv
Dataframe Rows: 5001
Dataframe Columns: 5
Column names: ['Datetime', 'Tweet Id', 'Text', 'URL', 'User']
----------------------------------------
Dataframe from file: APPL.csv
Dataframe Rows: 3181
Dataframe Columns: 5
Column names: ['Datetime', 'Tweet Id', 'Text', 'URL', 'User']
----------------------------------------
Dataframe from file: bitcoin.csv
Dataframe Rows: 5001
Dataframe Columns: 5
Column names: ['Datetime', 'Tweet Id', 'Text', 'URL', 'User']
----------------------------------------
Dataframe from file: coindesk.csv
Dataframe Rows: 5001
Dataframe Columns: 5
Column names: ['Datetime', 'Tweet Id', 'Text', 'URL', 'User']
----------------------------------------
Dataframe from file: Cryptocurrency.csv
Dataframe Rows: 5001
Dataframe Columns: 5
Column names: ['Datetime', 'Tweet Id', 'Text', 'URL', 'User']
----------------------------------------
Dataframe from file: Gold.csv
Dataframe Rows: 5001
Dataframe Col

In [6]:
# stack all per-file dataframes row-wise and renumber the rows from 0
df = pd.concat(dfs, axis=0, ignore_index=True)
df.tail()

Unnamed: 0,Datetime,Tweet Id,Text,URL,User
36826,2023-03-05 00:14:38+00:00,1632172734581972992,Khosi did what 😂😂😂😤😤😤😤 yhoo ha.a Nana is dange...,https://twitter.com/MzuzwanaYamkela/status/163...,https://twitter.com/MzuzwanaYamkela
36827,2023-03-05 00:13:20+00:00,1632172404909789184,Yhoo Nana what is your business with Thabang y...,https://twitter.com/MzuzwanaYamkela/status/163...,https://twitter.com/MzuzwanaYamkela
36828,2023-03-05 00:07:45+00:00,1632170999515869184,Yes yhoo,https://twitter.com/Lissssativa/status/1632170...,https://twitter.com/Lissssativa
36829,2023-03-05 00:03:14+00:00,1632169864218435585,Yhoo I was about to go to bed 😭😭 #BBTitans,https://twitter.com/its_sommy/status/163216986...,https://twitter.com/its_sommy
36830,2023-03-05 00:00:48+00:00,1632169253079072768,@NguboAyimbathwa @ChristoThurston @ThuliMadons...,https://twitter.com/Bright_Afrika/status/16321...,https://twitter.com/Bright_Afrika


In [7]:
# summary of the concatenated frame; return_shape=False so the call returns
# None and only the printed lines appear
eda.display_shape_and_colnames_df(df, return_shape=False)

Dataframe Rows: 36831
Dataframe Columns: 5
Column names: ['Datetime', 'Tweet Id', 'Text', 'URL', 'User']


In [8]:
class EDA2():
    """Data-quality reports: missing values, duplicated rows, unique values."""

    def check_missing_values_df(self, df : pd.DataFrame):
        """Display the count and the percentage of missing values per column."""
        verify_parameter(df, 'df', pd.DataFrame)

        # boolean mask of missing cells, reused for both count and percentage
        missing_mask = df.isna()

        print_lines(n=30, line='=')
        print("# Missing values per column:")
        display(missing_mask.sum())
        print_lines(n=30)
        print("% Missing values per column:")
        display(missing_mask.mean() * 100)
        print_lines(n=30, line='=')

    def check_duplicated_rows_df(self, df : pd.DataFrame):
        """Print the number and the percentage (2 decimals) of duplicated rows."""
        verify_parameter(df, 'df', pd.DataFrame)

        # True for every row that repeats an earlier row
        duplicate_mask = df.duplicated()

        print_lines(n=30, line='=')
        print("# Duplicated rows:", duplicate_mask.sum())
        print_lines(n=30)
        print("% Duplicated rows:", round(duplicate_mask.mean() * 100, 2))
        print_lines(n=30, line='=')

    def check_unique_values_df(self,df : pd.DataFrame):
        """Display the unique-value count per column and its share of total rows."""
        verify_parameter(df, 'df', pd.DataFrame)

        unique_counts = df.nunique()

        print_lines(n=30, line='=')
        print("# Unique values per column:")
        display(unique_counts)
        print_lines(n=30)
        print("% Unique values per column (relative to total rows):")
        display(round(unique_counts * 100 / df.shape[0], 2))
        print_lines(n=30, line='=')
    
# NOTE: this rebinds `eda`, which earlier held an EDA() instance; EDA2 does not
# define the shape/column helpers, so those are not reachable through `eda` after this cell
eda = EDA2()
eda.check_missing_values_df(df)
eda.check_duplicated_rows_df(df)
eda.check_unique_values_df(df)

# Missing values per column:


Datetime    0
Tweet Id    0
Text        0
URL         0
User        0
dtype: int64

------------------------------
% Missing values per column:


Datetime    0.0
Tweet Id    0.0
Text        0.0
URL         0.0
User        0.0
dtype: float64

# Duplicated rows: 460
------------------------------
% Duplicated rows: 1.25
# Unique values per column:


Datetime    29029
Tweet Id    36371
Text        35661
URL         36371
User        18564
dtype: int64

------------------------------
% Unique values per column (relative to total rows):


Datetime    78.82
Tweet Id    98.75
Text        96.82
URL         98.75
User        50.40
dtype: float64



### 3. Cleaning Data

In [9]:
import string
import re

class CleanData():
    """Row- and string-level cleaning helpers."""

    # Translation table that deletes every character of string.punctuation.
    # Built once here: remove_punctuation_string is called once per row through
    # Series.apply, so rebuilding the table on each call is wasted work.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def remove_duplicates_df(self, df : pd.DataFrame, reset_index : bool = True):
        """Return a de-duplicated copy of ``df``; ``df`` itself is not modified.

        The duplicate report of ``EDA2.check_duplicated_rows_df`` is printed
        before and after the removal.

        Parameters
        ----------
        df : pd.DataFrame
            Frame to de-duplicate (fully identical rows are dropped).
        reset_index : bool
            When true the result is re-indexed from 0; otherwise the original
            index labels of the kept rows are preserved.

        Raises
        ------
        ValueError
            From ``verify_parameter`` when an argument has the wrong type.
        """
        verify_parameter(df, 'df', pd.DataFrame)
        verify_parameter(reset_index, 'reset_index', bool)

        eda = EDA2()
        print("Before Removal:")
        eda.check_duplicated_rows_df(df)

        # non-inplace removal; both branches hand back an independent frame
        deduplicated = df.drop_duplicates()
        if reset_index:
            deduplicated = deduplicated.reset_index(drop=True)
        else:
            deduplicated = deduplicated.copy()

        print("After Removal:")
        eda.check_duplicated_rows_df(deduplicated)

        return deduplicated

    def remove_punctuation_string(self, text : str):
        """Return ``text`` with every ``string.punctuation`` character deleted.

        Characters are removed, not replaced by a space, so text on both sides
        of a punctuation mark is joined together.
        """
        verify_parameter(text, 'text', str)

        return text.translate(self._PUNCTUATION_TABLE)

    def remove_numbers_string(self, text : str):
        """Return ``text`` with every run of digits (regex ``\\d+``) deleted."""
        verify_parameter(text, 'text', str)

        return re.sub(r'\d+', '', text)
    
# cleaner instance used by the following cell
cd = CleanData()

In [10]:
# drop fully duplicated rows (prints the before/after duplicate report)
df = cd.remove_duplicates_df(df)
# strip punctuation first, then digits, from every value of the Text column
df['Text'] = (
    df['Text']
    .apply(cd.remove_punctuation_string)
    .apply(cd.remove_numbers_string)
)

Before Removal:
# Duplicated rows: 460
------------------------------
% Duplicated rows: 1.25
After Removal:
# Duplicated rows: 0
------------------------------
% Duplicated rows: 0.0


In [11]:
# inspect the last rows after de-duplication and punctuation/digit removal
df.tail()

Unnamed: 0,Datetime,Tweet Id,Text,URL,User
36366,2023-03-05 00:14:38+00:00,1632172734581972992,Khosi did what 😂😂😂😤😤😤😤 yhoo haa Nana is danger...,https://twitter.com/MzuzwanaYamkela/status/163...,https://twitter.com/MzuzwanaYamkela
36367,2023-03-05 00:13:20+00:00,1632172404909789184,Yhoo Nana what is your business with Thabang y...,https://twitter.com/MzuzwanaYamkela/status/163...,https://twitter.com/MzuzwanaYamkela
36368,2023-03-05 00:07:45+00:00,1632170999515869184,Yes yhoo,https://twitter.com/Lissssativa/status/1632170...,https://twitter.com/Lissssativa
36369,2023-03-05 00:03:14+00:00,1632169864218435585,Yhoo I was about to go to bed 😭😭 BBTitans,https://twitter.com/its_sommy/status/163216986...,https://twitter.com/its_sommy
36370,2023-03-05 00:00:48+00:00,1632169253079072768,NguboAyimbathwa ChristoThurston ThuliMadonsela...,https://twitter.com/Bright_Afrika/status/16321...,https://twitter.com/Bright_Afrika


In [12]:
from nltk import word_tokenize

class CleanData2():
    """Token- and character-level cleaning of one text column of a dataframe.

    Tokenisation is done with ``word_tokenize`` (imported from ``nltk``).
    """

    def get_words_len_n(self, df : pd.DataFrame, col : str, n : int, display : bool = False, return_list : bool = True):
        """Collect the distinct tokens of exactly ``n`` characters in ``df[col]``.

        Parameters
        ----------
        df : pd.DataFrame
            Source frame.
        col : str
            Name of the text column to scan.
        n : int
            Token length to look for.
        display : bool
            When true, print the sorted tokens. (Inside this method the name
            shadows IPython's ``display`` function.)
        return_list : bool
            When true, return the sorted list; otherwise return ``None``.

        Raises
        ------
        ValueError
            From ``verify_parameter`` when an argument has the wrong type.
        """
        verify_parameter(df, 'df', pd.DataFrame)
        verify_parameter(col, 'col', str)
        verify_parameter(n, 'n', int)
        verify_parameter(display, 'display', bool)
        verify_parameter(return_list, 'return_list', bool)

        # a set de-duplicates while scanning, so no huge intermediate list is built
        found = set()
        for unique in df[col].unique():
            found.update(word for word in word_tokenize(unique) if len(word) == n)

        words = sorted(found)

        if display:
            print("Words of length", n, "from column '" + col + "'")
            print(words)
        if return_list: return words

    def remove_words_df_col(self, df : pd.DataFrame, col : str, words : list):
        """Return a copy of ``df`` where the tokens listed in ``words`` are removed from ``col``.

        Every value of the column is tokenised, tokens found in ``words`` are
        dropped and the remaining tokens are re-joined with single spaces.
        ``df`` itself is not modified.

        Raises
        ------
        ValueError
            From ``verify_parameter`` when an argument has the wrong type.
        """
        verify_parameter(df, 'df', pd.DataFrame)
        verify_parameter(col, 'col', str)
        verify_parameter(words, 'words', list)

        # set membership is O(1); `words` can hold hundreds of entries
        banned = set(words)

        # tokenise each distinct value once and remember its cleaned form
        cleaned_by_value = {
            value: ' '.join(token for token in word_tokenize(value) if token not in banned)
            for value in df[col].unique()
        }

        # Assign the mapped column back explicitly. The previous
        # `df_copy[col].replace(..., inplace=True)` acted on a temporary Series
        # (ineffective under pandas Copy-on-Write) and rescanned the whole
        # column once per unique value.
        df_copy = df.copy()
        df_copy[col] = df_copy[col].map(cleaned_by_value)

        return df_copy

    def remove_chars_df_col(self, df : pd.DataFrame, col : str, chars : list):
        """Return a copy of ``df`` with every string in ``chars`` deleted from ``col``.

        Each entry is removed as a literal substring (``regex=False``), in the
        order given. ``df`` itself is not modified.

        Raises
        ------
        ValueError
            From ``verify_parameter`` when an argument has the wrong type.
        """
        verify_parameter(df, 'df', pd.DataFrame)
        verify_parameter(col, 'col', str)
        verify_parameter(chars, 'chars', list)

        df_copy = df.copy()

        cleaned = df_copy[col]
        for char in chars:
            cleaned = cleaned.str.replace(char, '', regex=False)
        df_copy[col] = cleaned

        return df_copy

# cleaner instance used by the following cells
cd2 = CleanData2()

In [13]:
# collect the distinct 1-character tokens found in column Text;
# display=True also prints the sorted list
words = cd2.get_words_len_n(df, 'Text', 1, display=True)

Words of length 1 from column 'Text'
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '£', '¥', '«', '¯', '°', '±', '´', '·', '»', '¿', 'À', 'È', 'É', 'Ö', 'à', 'á', 'å', 'è', 'é', 'ñ', 'ó', 'ü', 'ý', 'ı', '̀', '̫', '͒', 'Ξ', 'ή', 'η', 'ω', 'А', 'В', 'И', 'С', 'У', 'Я', 'а', 'в', 'з', 'и', 'о', 'п', 'с', 'у', 'я', 'і', '،', '؟', 'ب', 'د', 'ع', 'ف', 'م', 'و', 'ي', '٪', 'ۗ', 'ۚ', 'आ', 'न', 'ಠ', 'ಥ', 'ඉ', 'ด', 'อ', 'ื', '฿', 'ᐟ', 'ạ', 'ở', '\u200b', '–', '—', '‘', '’', '“', '”', '„', '•', '‣', '…', '‧', '›', '‿', '\u2060', '₦', '€', '₱', '₹', '₺', '₿', '⃣', '↑', '→', '↓', '∂', '−', '√', '∠', '⊂', '⊃', '⋆', '⌘', '⍃', '⏪', '⏫', '⏬', '⏰', '⏳', '─', '│', '┐', '║', '╥', '▅', '■', '▪', '▰', '▶', '▸', '►', '◆', '●', '◔', '◾', '★', '☆', '♟', '♡', '♥', '♦', '♪', '♫', '♾', '⚒', '⚔', '⚠

In [14]:
# remove words of length 1 from column Text (every token listed in `words`);
# the method returns a modified copy, which replaces `df` here
df = cd2.remove_words_df_col(df, 'Text', words)

In [15]:
# inspect the last rows after the 1-character tokens were removed
df.tail()

Unnamed: 0,Datetime,Tweet Id,Text,URL,User
36366,2023-03-05 00:14:38+00:00,1632172734581972992,Khosi did what 😂😂😂😤😤😤😤 yhoo haa Nana is danger...,https://twitter.com/MzuzwanaYamkela/status/163...,https://twitter.com/MzuzwanaYamkela
36367,2023-03-05 00:13:20+00:00,1632172404909789184,Yhoo Nana what is your business with Thabang y...,https://twitter.com/MzuzwanaYamkela/status/163...,https://twitter.com/MzuzwanaYamkela
36368,2023-03-05 00:07:45+00:00,1632170999515869184,Yes yhoo,https://twitter.com/Lissssativa/status/1632170...,https://twitter.com/Lissssativa
36369,2023-03-05 00:03:14+00:00,1632169864218435585,Yhoo was about to go to bed 😭😭 BBTitans,https://twitter.com/its_sommy/status/163216986...,https://twitter.com/its_sommy
36370,2023-03-05 00:00:48+00:00,1632169253079072768,NguboAyimbathwa ChristoThurston ThuliMadonsela...,https://twitter.com/Bright_Afrika/status/16321...,https://twitter.com/Bright_Afrika


In [None]:
# get non-alphabetic characters
# Join all texts into one string before collecting the distinct characters:
# summing per-row lists (`.apply(list).sum()`) concatenates an ever-growing list
# once per row, which is quadratic in the size of the corpus.
unique_chars = list(set(''.join(df['Text'])))
# The space is excluded so words stay separated; filtering it here (instead of
# list.remove) also avoids a ValueError when no space occurs in the column.
non_alphabetic_chars = [char for char in unique_chars if not char.isalpha() and char != ' ']

# remove non-alphabetic characters from column Text
df = cd2.remove_chars_df_col(df, 'Text', non_alphabetic_chars)

In [None]:
# inspect the last rows after the non-alphabetic characters were removed
df.tail()

### 4. Visualizing Data