In [33]:
import pandas as pd
import numpy as np
import re
import sys

<h1>Loading Raw Data</h1>
<h4>Scope:</h4>
<ul>
<li>Reading CSV files</li>
<li>Merging various datasets</li>
<li>Deleting duplicate rows</li>
<li>Deleting empty columns</li>
<li>Maintaining relevant columns</li>
<li>Shuffling data</li>
<li>Resetting the index of the data</li>
</ul>

In [50]:
# Loading CSV files
# The exports are tab separated (hence sep='\t') and are read as UTF-16; reading them
# with another encoding is reported to cause parse errors.
# The python parsing engine is a preference only; the c engine can be used instead.
CSV_READ_OPTIONS = dict(sep='\t', engine='python', encoding='utf-16')

first = pd.read_csv('afghanistan.csv', **CSV_READ_OPTIONS)
second = pd.read_csv('Belarus.csv', **CSV_READ_OPTIONS)
third = pd.read_csv('Ethiopia1.csv', **CSV_READ_OPTIONS)
fourth = pd.read_csv('Ethiopia2.csv', **CSV_READ_OPTIONS)

# Concatenating all the loaded files, then:
#   * dropping rows that repeat the same (URL, Date, Hit Sentence) combination
#   * dropping columns in which EVERY value is null (partially filled columns are kept)
merged_data = (
    pd.concat([first, second, third, fourth])
    .drop_duplicates(subset=['URL', 'Date', 'Hit Sentence'])
    .dropna(axis=1, how='all')
)

# Trimming the data to keep the potentially useful variables and declutter the frame.
# 'Alternate Date Format' and 'URL' are kept although they may have very low influence on the data.
RELEVANT_COLUMNS = [
    'Alternate Date Format',
    'Twitter Followers', 'Twitter Following',
    'Reach', 'Country', 'Hit Sentence', 'URL',
]

# Fixed seed so that "Restart Kernel -> Run All" always yields the same row order
# (and therefore the same exported file).
SHUFFLE_SEED = 42

# Shuffling the rows to mix the themes of the four source files, then resetting the index.
trimmed_data = (
    merged_data[RELEVANT_COLUMNS]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

<h1>Deleting non-UTF-8 characters</h1>

In [52]:
# Drop characters that cannot be represented in UTF-8 (e.g. lone surrogates) from every tweet.
# The 'ignore' handler has to be on the ENCODE step: that is the only step that can fail,
# because decoding bytes that were just produced by a UTF-8 encode always succeeds.
# Non-string cells (such as NaN for a missing tweet) are passed through unchanged
# instead of raising TypeError.
trimmed_data['Hit Sentence'] = [
    tweet.encode('utf-8', 'ignore').decode('utf-8') if isinstance(tweet, str) else tweet
    for tweet in trimmed_data['Hit Sentence']
]

<h1>Helper Class for further optional cleaning</h1>
<h4>Scope</h4>
<ul>
<li>
Standardizing twitter accounts mentioned in tweets
</li>
<li>
Removing countries used in the data extraction
</li>
</ul>

In [76]:
class firstProcess:
    """Optional text-cleaning helpers for a DataFrame of tweets.

    Two independent clean-ups are offered:

    * ``mentionsStandardization`` replaces Twitter handles with placeholder tokens.
    * ``mainCountriesRemove`` strips the country keywords that were used to collect the data.

    Both DataFrame-level methods overwrite the tweet column of the frame they are
    given (the frame is modified in place) and return that same frame.
    """

    # Raw-string patterns, compiled once for the whole class.
    _QUOTE_TWEET_RE = re.compile(r"QT @\w+")
    _RETWEET_RE = re.compile(r"RT @\w+")
    _MENTION_RE = re.compile(r"@\w+")
    # Optional leading '#', one of the search keywords (including the common
    # 'Afganistan' typo) and any word characters glued to it (e.g. 'Sudanese').
    _COUNTRY_RE = re.compile(
        r"#?(?:Lithuania|Belarus|Afghanistan|Afganistan|Ethiopia|Sudan|Tigray)\w*",
        flags=re.IGNORECASE,
    )

    def mentionsStandardization(self,data:pd.DataFrame,tweet_column='Hit Sentence'):
        """Replace every @handle in the tweet column with a placeholder token.

        Args:
            data (pd.DataFrame): Dataframe object that contains the tweets.
                Its ``tweet_column`` is overwritten in place.
            tweet_column (str, optional): The column that contains the tweets.
                Defaults to 'Hit Sentence'.

        Returns:
            pd.DataFrame: The same dataframe object that was passed in.
        """
        data[tweet_column] = [self.mentionscapture(tweet) for tweet in data[tweet_column]]
        return data


    def mentionscapture(self,tweet):
        """Convert the mentions of a single tweet into placeholder tokens.

        ``QT @handle`` becomes ``QUOTE_TWEET``, ``RT @handle`` becomes ``RE_TWEET``
        and every remaining ``@handle`` becomes ``_MENTIONS_``. The placeholders
        are words that are unlikely to appear in a normal tweet.

        Args:
            tweet (str): The text of one tweet.

        Returns:
            str | float: The refined tweet, or ``np.nan`` when ``tweet`` is empty
            or is not a string (e.g. a missing value).
        """
        # Missing cells arrive as float NaN, which is truthy; check the type
        # explicitly so they do not reach the regex engine and raise TypeError.
        if not isinstance(tweet, str) or not tweet:
            return np.nan

        # Order matters: the quote/retweet prefixes must be consumed before the
        # generic mention pattern, which would otherwise swallow their handles.
        refined_tweet = self._QUOTE_TWEET_RE.sub('QUOTE_TWEET', tweet)
        refined_tweet = self._RETWEET_RE.sub('RE_TWEET', refined_tweet)
        return self._MENTION_RE.sub('_MENTIONS_', refined_tweet)

    def removingBiasedCountries(self,tweet):
        """Remove the country keywords that were used to extract the dataset.

        NB: The dataset was generated by searching for refugee mentions associated
        with Lithuania, Belarus, Afghanistan, Ethiopia, Sudan and Tigray. As a
        result, these names are near-constant across tweets, which might affect
        the classifications. This is an optional step.

        Every match (in any letter case, with an optional leading ``#`` and any
        trailing word characters) is replaced by a single space.

        Args:
            tweet (str): The text of one tweet.

        Returns:
            str | float: The refined tweet, or ``np.nan`` when ``tweet`` is empty
            or is not a string (e.g. a missing value).

        Todo:
            Possible to go further and strip all flags associated with these countries.
        """
        if not isinstance(tweet, str) or not tweet:
            return np.nan

        # The flag lives on the compiled pattern. Passing re.IGNORECASE as the
        # fourth positional argument of re.sub would set ``count`` instead.
        return self._COUNTRY_RE.sub(' ', tweet)

    def mainCountriesRemove(self,data:pd.DataFrame,tweet_Column='Hit Sentence'):
        """Remove the extraction keywords (country names) from the tweet column.

        Args:
            data (pd.DataFrame): Dataframe object that contains the tweets.
                Its ``tweet_Column`` is overwritten in place.
            tweet_Column (str, optional): The column that contains the tweets.
                Defaults to 'Hit Sentence'.

        Returns:
            pd.DataFrame: The same dataframe object that was passed in.
        """
        data[tweet_Column] = [self.removingBiasedCountries(tweet) for tweet in data[tweet_Column]]
        return data

In [77]:
# Only the mentions are standardised here; the country names stay in the tweets.
# To strip the countries as well, add a second line that passes a dataframe to
# firstProcess().mainCountriesRemove(...) and store the result in a new variable.
fully_cleaned_data = firstProcess().mentionsStandardization(
    data=trimmed_data,
    tweet_column='Hit Sentence',
)

# Write the cleaned tweets to an Excel workbook, leaving out the index column.
fully_cleaned_data.to_excel(excel_writer='cleaned_data.xlsx', index=False)