In [None]:
#import libraries
import pandas as pd
import numpy as np
import snscrape.modules.twitter as sntwitter
import re # In-built regular expressions library
import string
import nltk

# Natural Language Processing Toolkit
from nltk.corpus import stopwords, words # get stopwords from NLTK library & get all words in english language
from nltk.tokenize import word_tokenize # to create word tokens
# from nltk.stem import PorterStemmer (I played around with Stemmer and decided to use Lemmatizer instead)
from nltk.stem import WordNetLemmatizer # to reduce words to their original (base) form
from nltk import pos_tag # For Parts of Speech tagging

from textblob import TextBlob # TextBlob - Python library for processing textual data

# WordCloud - Python library for creating image wordclouds
from wordcloud import WordCloud
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

## Data Collection

In [None]:
# Scrape every tweet matching #LabourChallenge inside the date window and
# load the scraped items into a DataFrame.
tweets_lab = pd.DataFrame(
    sntwitter.TwitterSearchScraper('#LabourChallenge since:2022-07-14 until:2022-08-23').get_items()
)

In [None]:
# Copy the 'location' entry of each nested 'user' record into its own column
tweets_lab['user_location'] = tweets_lab['user'].map(lambda user: user['location'])

In [None]:
# Scrape every tweet matching #DignityInLabour inside the date window and
# load the scraped items into a DataFrame.
tweets_dig = pd.DataFrame(
    sntwitter.TwitterSearchScraper('#DignityInLabour since:2022-07-14 until:2022-08-23').get_items()
)

In [None]:
# Copy the 'location' entry of each nested 'user' record into its own column
tweets_dig['user_location'] = tweets_dig['user'].map(lambda user: user['location'])

In [None]:
# Columns that the rest of the notebook does not use; removed from the
# Labour frame in place.
col = [
    'rawContent', 'user', 'retweetCount', 'likeCount', 'sourceLabel',
    'renderedContent', 'replyCount', 'quoteCount', 'conversationId', 'lang',
    'source', 'sourceUrl', 'links', 'media', 'retweetedTweet',
    'quotedTweet', 'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers', 'cashtags',
    'card', 'url', 'place', 'hashtags', 'date',
]
tweets_lab.drop(col, axis=1, inplace=True)
tweets_lab

In [None]:
# Columns that the rest of the notebook does not use; removed from the
# Dignity frame in place.
col = [
    'rawContent', 'user', 'retweetCount', 'likeCount', 'sourceLabel',
    'renderedContent', 'replyCount', 'quoteCount', 'conversationId', 'lang',
    'source', 'sourceUrl', 'links', 'media', 'retweetedTweet',
    'quotedTweet', 'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers', 'cashtags',
    'card', 'url', 'place', 'hashtags', 'date',
]
tweets_dig.drop(col, axis=1, inplace=True)
tweets_dig

In [None]:
# Second pass over the same #LabourChallenge query, keeping only the per-tweet
# fields used later. Note: the search string carries no location filter.
data = []
# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#LabourChallenge since:2022-07-14 until:2022-08-23').get_items()):
    # Stop after index 30000, i.e. at most 30001 rows are collected
    if i>30000:
        break
    # `rawContent` is the text field the column-drop cell already relies on;
    # `content` is only a deprecated alias for it in snscrape.
    data.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.rawContent, tweet.id, tweet.retweetCount])

In [None]:
# Turn the collected rows into a frame with readable column headers
df1 = pd.DataFrame(
    data,
    columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet", "id", "Number of Retweets"],
)

In [None]:
# Second pass over the same #DignityInLabour query, keeping only the per-tweet
# fields used later. Note: the search string carries no location filter.
datas = []
# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#DignityInLabour since:2022-07-14 until:2022-08-23').get_items()):
    # Stop after index 30000, i.e. at most 30001 rows are collected
    if i>30000:
        break
    # `rawContent` is the text field the column-drop cell already relies on;
    # `content` is only a deprecated alias for it in snscrape.
    datas.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.rawContent, tweet.id, tweet.retweetCount])

In [None]:
# Turn the collected rows into a frame with readable column headers
df2 = pd.DataFrame(
    datas,
    columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet", "id", "Number of Retweets"],
)

In [None]:
# Inner-join the two Labour frames on their shared 'id' column
labour = tweets_lab.merge(df1, on='id')
print(labour.shape)
labour.head()

In [None]:
# Inner-join the two Dignity frames on their shared 'id' column
dignity = tweets_dig.merge(df2, on='id')
print(dignity.shape)
dignity.head()

## Data Cleaning Process

In [None]:
# Give the id, user_location and coordinates columns presentation names
labour.rename(
    columns={
        "id": "Tweet ID",
        "user_location": "Location",
        "coordinates": "Coordinates",
    },
    inplace=True,
)
labour.head()

In [None]:
# Give the id, user_location and coordinates columns presentation names
dignity.rename(
    columns={
        "id": "Tweet ID",
        "user_location": "Location",
        "coordinates": "Coordinates",
    },
    inplace=True,
)
dignity.head()

In [None]:
# Put the columns into presentation order (only the listed columns are kept)
cols = [
    'User', 'Tweet ID', 'Tweet',
    'Number of Likes', 'Number of Retweets', 'Source of Tweet',
    'Location', 'Date Created', 'Coordinates',
]
labour = labour.reindex(cols, axis='columns')
labour.head()

In [None]:
# Put the columns into presentation order (only the listed columns are kept)
cols = [
    'User', 'Tweet ID', 'Tweet',
    'Number of Likes', 'Number of Retweets', 'Source of Tweet',
    'Location', 'Date Created', 'Coordinates',
]
dignity = dignity.reindex(cols, axis='columns')
dignity.head()

In [None]:
# Remove every http(s) URL from the tweet text while keeping the words around it.
# A raw-string pattern avoids the invalid '\/' escape, and substituting (rather
# than splitting and keeping the first piece) preserves text that follows a link.
labour['Tweet'] = labour['Tweet'].apply(lambda x: re.sub(r'https?://\S+', '', str(x)))

In [None]:
# Remove every http(s) URL from the tweet text while keeping the words around it.
# A raw-string pattern avoids the invalid '\/' escape, and substituting (rather
# than splitting and keeping the first piece) preserves text that follows a link.
dignity['Tweet'] = dignity['Tweet'].apply(lambda x: re.sub(r'https?://\S+', '', str(x)))

In [None]:
# Persist the cleaned Labour frame as UTF-8 CSV, without the index column
labour.to_csv('labour.csv', index=False, encoding='utf-8')

In [None]:
# Persist the cleaned Dignity frame as UTF-8 CSV, without the index column
dignity.to_csv('dignity.csv', index=False, encoding='utf-8')

### Getting Necessary Columns and Data

In [None]:
# define a function to extract the noun/adjective tokens from a text
def getNouns(tweet, keep_tags=('NN', 'NNP', 'JJ')):
    """Return the tokens of `tweet` whose part-of-speech tag is in `keep_tags`.

    Args:
        tweet: Text to process; it is tokenised with NLTK's `word_tokenize`
            and tagged with `pos_tag`.
        keep_tags: Part-of-speech tags to keep. The default keeps tokens
            tagged 'NN', 'NNP' or 'JJ', so despite the function name
            adjectives are included, while plural-noun tags ('NNS', 'NNPS')
            are not. Pass a different collection to change the selection.

    Returns:
        The kept tokens, in their original order, joined by single spaces.
    """
    wanted = frozenset(keep_tags)
    tokens = word_tokenize(tweet)  # convert string to tokens
    return " ".join(word for word, tag in pos_tag(tokens) if tag in wanted)

In [None]:
# Quick manual check of getNouns on one sample tweet.
# NOTE(review): the sample looks like a real tweet with a name and phone
# number; consider redacting it before sharing the notebook.
text = (
    '#LabourChallenge my name is segun akinsola screeding and painting and pop celling '
    'may God bless my ✋ work amen. Pls I need work ooh today is making me 2month and two '
    'weeks if you have work for me pls call me 08034653648'
)
getNouns(text)

In [None]:
# Run getNouns over every Labour tweet and store the result in a new column
labour['Processed_Tweets'] = labour['Tweet'].map(getNouns)

In [None]:
# Run getNouns over every Dignity tweet and store the result in a new column
dignity['Processed_Tweets'] = dignity['Tweet'].map(getNouns)

In [None]:
# Lower-case the processed text so the keyword matching is case-insensitive
labour['Processed_Tweets'] = labour.Processed_Tweets.str.lower()
labour.Processed_Tweets

In [None]:
# Lower-case the processed text so the keyword matching is case-insensitive
dignity['Processed_Tweets'] = dignity.Processed_Tweets.str.lower()
dignity.Processed_Tweets

In [None]:
# Build the career keyword list and keep, per tweet, only the tokens that
# appear in it (comma-joined) as a new 'Career' column.
#
# Notes:
# - Matching is done on single whitespace-separated tokens, so multi-word
#   entries such as 'fashion designer' or 'truck driver' can never match.
# - `words` rebinds the name imported from nltk.corpus at the top of the file.
# - `stop` is not used in this cell.
stop = list(stopwords.words('english'))
words= ['engineer', 'farmer','lawyer','manager','baker','caterer','student','painting','software','civil',
      'engineer', 'project', 'manager', 'petroleum','making','soap', 'sailor', 'pilot', 'investment', 'dentist',
       'software', 'developer', 'accountant','entrepreneur','entrepreneur','designer','designer', 'professional',
        'printer', 'trader','paints','legal','freelance','nurse','surgeon','zoologist',
      'optometrist','fashion','fashion','fashion designer','guitar','music', 'tech','startup','start-up',
      'truck driver','driver','truck','owner','business','market','cook','cooking','pharmacist',
       'electrician', 'mechanic', 'machinist','medical', 'doctor','banker','lecturer','lecturer','project manager',
       'pharmacist','it', 'security', 'analyst', 'ethical', 'hackers','programmers',
       'insurance', 'agent', 'human', 'capital', 'developer','hustler','photographer','craft','borehole', 'technician',
       'educator','dealer','dealer','chemical','chemical','photography','business','service','marketer',
       'cosmetics','clay','sculptor','clay sculptor','project','graphic designer','graphic','orientator',
       'consultant','retailer','teacher','actress','acting','own','cleaning','agency','footballer',
       'footballer','architect','architect','realtor','realtor','biomedical','duvets','genticist','project','graphic designer','graphic','orientator',
       'vendor','preacher', 'builder' , 'writer', 'skilled','drummer','service','businesswoman','businessman','linguist','chef',
       'artist','student','students','business woman','water treatment','freelance','geophysicist','brand','filmmaker','surveyor',
       'locksmith','nurse','tailor','brand owner','painter','plumbing','shoemaker','graduate','cinematographer','truck','videographer',
       'graphics','ui/ux','ui/ux','broker','economist','veterinarian','consultant','labourer', 'event','manager','sonographer',
        'profession','marketing','stylist','translator','supplier','trainee','digital','marketer','geologist','planner','event',
       'driver','butcherman','business man','artisan','physiotherapist','specialist','barber','scientist','driver']
# Hash-based lookup: one probe per token instead of scanning the whole list
labour_career_keywords = frozenset(words)
labour['Career'] = labour['Processed_Tweets'].apply(
    lambda x: ','.join(word for word in x.split() if word in labour_career_keywords))
labour['Career']

In [None]:
# Keep, per Dignity tweet, only the tokens found in the career keyword list
# (comma-joined) as a new 'Career' column.
#
# The keyword list `words` (and `stop`) are defined in the preceding Labour
# cell; this cell used to carry an identical copy, so it now reuses them
# instead of redefining them. Matching is per whitespace-separated token.
dignity_career_keywords = frozenset(words)  # hash-based lookup per token
dignity['Career'] = dignity['Processed_Tweets'].apply(
    lambda x: ','.join(word for word in x.split() if word in dignity_career_keywords))
dignity['Career']

In [None]:
# Show the extracted career keywords for the Labour tweets
labour['Career']

In [None]:
# Count the rows whose Career value is the empty string
sum(labour.Career.eq(''))

In [None]:
# Count the rows whose Career value is the empty string
sum(dignity.Career.eq(''))

In [None]:
# Replace empty / whitespace-only Career strings with NaN.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x;
# item assignment is used so the column is always the target of the write.
labour['Career'] = labour['Career'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Replace empty / whitespace-only Career strings with NaN.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x;
# item assignment is used so the column is always the target of the write.
dignity['Career'] = dignity['Career'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Collapse the comma-joined keyword strings in labour['Career'] to one label.
#
# Rules are (substring, label) pairs checked top to bottom; the FIRST rule whose
# substring occurs in the raw value decides the label. Matching always runs on
# the raw value, so a label written by one rule is never re-matched by a later
# one (previously 'fashion designer' ended up as 'designer' and
# 'project manager' as 'manager'). Other deliberate ordering choices:
# - 'biomedical' is lower-case (the data is lower-cased) and sits before
#   'medical', otherwise it could never produce 'biomedical engineer';
# - 'software' and 'civil' sit before the generic 'engineer' rule so that
#   e.g. 'software,engineer' becomes 'software engineer'.
# Rules that could never fire (shadowed by an identical earlier result) are omitted.
_LABOUR_CAREER_RULES = [
    ('graduate,graduate', 'graduate'),
    ('soap', 'soap maker'),
    ('truck,driver', 'truck driver'),
    ('marketer,owner', 'marketer'),
    ('nurse', 'nurse'),
    ('surgeon', 'surgeon'),
    ('zoologist', 'zoologist'),
    ('painter', 'painter'),
    ('caterer', 'caterer'),
    ('pharmacist', 'pharmacist'),
    ('sales,representative', 'sales representative'),
    ('hustler', 'hustler'),
    ('student', 'student'),
    ('painting', 'painter'),
    ('business,farmer', 'farmer'),
    ('tech,startup,service', 'tech startup'),
    ('truck', 'truck driver'),
    ('biomedical', 'biomedical engineer'),
    ('medical', 'doctor'),
    ('cooking,human', 'cook'),
    ('writer,stylist', 'writer'),
    ('stylist', 'stylist'),
    ('tailor', 'tailor'),
    ('realtor', 'realtor'),
    ('fashion', 'fashion designer'),
    ('insurance', 'insurance agent'),
    ('vendor', 'vendor'),
    ('event', 'event planner'),
    ('graduate', 'graduate'),
    ('video', 'videographer'),
    ('photo', 'photographer'),
    ('technician', 'technician'),
    ('dealer', 'chemical dealer'),
    ('chemical,engineer', 'engineer'),
    ('labour', 'labourer'),
    ('cleaning', 'cleaner'),
    ('designer', 'designer'),
    ('farmer', 'farmer'),
    ('project', 'project manager'),
    ('sculptor', 'sculptor'),
    ('accountant', 'accountant'),
    ('baker', 'baker'),
    ('marketer', 'marketer'),
    ('marketing', 'marketer'),
    ('architect', 'architect'),
    ('software', 'software engineer'),
    ('civil', 'civil engineer'),
    ('engineer', 'engineer'),
    ('artist', 'artist'),
    ('banker', 'banker'),
    ('business', 'business'),
    ('consultant', 'consultant'),
    ('entrepreneur', 'entrepreneur'),
    ('actress', 'actress'),
    ('brand', 'brand owner'),
    ('teacher', 'teacher'),
    ('developer', 'developer'),
    ('manager', 'manager'),
    ('analyst', 'software analyst'),
    ('translator', 'translator'),
    ('professional', 'professional'),
    ('printer', 'printer'),
    ('printing', 'printer'),
    ('trader', 'trader'),
    ('trade', 'trader'),
    ('paints', 'painter'),
    ('legal', 'lawyer'),
    ('freelance', 'freelancer'),
]


def _normalize_labour_career(career):
    """Return the label of the first rule whose substring occurs in `career`.

    Non-string values (e.g. NaN for tweets without a keyword) and strings that
    match no rule are returned unchanged.
    """
    if not isinstance(career, str):
        return career
    for pattern, label in _LABOUR_CAREER_RULES:
        if pattern in career:
            return label
    return career


labour['Career'] = labour['Career'].map(_normalize_labour_career)

In [None]:
# Frequency of each normalised career label in the Labour data
labour['Career'].value_counts()

In [None]:
# Collapse the comma-joined keyword strings in dignity['Career'] to one label.
#
# Rules are (substring, label) pairs checked top to bottom; the FIRST rule whose
# substring occurs in the raw value decides the label. Matching always runs on
# the raw value, so a label written by one rule is never re-matched by a later
# one (previously 'fashion designer' ended up as 'designer' and
# 'project manager' as 'manager'). Other deliberate ordering choices:
# - 'biomedical' is lower-case (the data is lower-cased) and sits before
#   'medical', otherwise it could never produce 'biomedical engineer';
# - 'software' and 'civil' sit before the generic 'engineer' rule so that
#   e.g. 'software,engineer' becomes 'software engineer'.
# Rules that could never fire (shadowed by an identical earlier result) are omitted.
_DIGNITY_CAREER_RULES = [
    ('graduate,graduate', 'graduate'),
    ('truck,driver', 'truck driver'),
    ('marketer,owner', 'marketer'),
    ('painter', 'painter'),
    ('student', 'student'),
    ('nurse', 'nurse'),
    ('surgeon', 'surgeon'),
    ('zoologist', 'zoologist'),
    ('pharmacist', 'pharmacist'),
    ('painting', 'painter'),
    ('sales,representative', 'sales representative'),
    ('hustler', 'hustler'),
    ('caterer', 'caterer'),
    ('business,farmer', 'farmer'),
    ('tech,startup,service', 'tech startup'),
    ('truck', 'truck driver'),
    ('biomedical', 'biomedical engineer'),
    ('medical', 'doctor'),
    ('cooking,human', 'cook'),
    ('writer,stylist', 'writer'),
    ('stylist', 'stylist'),
    ('tailor', 'tailor'),
    ('realtor', 'realtor'),
    ('fashion', 'fashion designer'),
    ('insurance', 'insurance agent'),
    ('vendor', 'vendor'),
    ('event', 'event planner'),
    ('graduate', 'graduate'),
    ('video', 'videographer'),
    ('photo', 'photographer'),
    ('technician', 'technician'),
    ('dealer', 'chemical dealer'),
    ('chemical,engineer', 'engineer'),
    ('labour', 'labourer'),
    ('cleaning', 'cleaner'),
    ('designer', 'designer'),
    ('farmer', 'farmer'),
    ('project', 'project manager'),
    ('sculptor', 'sculptor'),
    ('accountant', 'accountant'),
    ('baker', 'baker'),
    ('marketer', 'marketer'),
    ('marketing', 'marketer'),
    ('architect', 'architect'),
    ('software', 'software engineer'),
    ('civil', 'civil engineer'),
    ('engineer', 'engineer'),
    ('artist', 'artist'),
    ('banker', 'banker'),
    ('business', 'business'),
    ('consultant', 'consultant'),
    ('entrepreneur', 'entrepreneur'),
    ('actress', 'actress'),
    ('brand', 'brand owner'),
    ('teacher', 'teacher'),
    ('developer', 'developer'),
    ('manager', 'manager'),
    ('analyst', 'software analyst'),
    ('translator', 'translator'),
    ('professional', 'professional'),
    ('printer', 'printer'),
    ('printing', 'printer'),
    ('trader', 'trader'),
    ('trade', 'trader'),
    ('paints', 'painter'),
    ('legal', 'lawyer'),
    ('freelance', 'freelancer'),
]


def _normalize_dignity_career(career):
    """Return the label of the first rule whose substring occurs in `career`.

    Non-string values (e.g. NaN for tweets without a keyword) and strings that
    match no rule are returned unchanged.
    """
    if not isinstance(career, str):
        return career
    for pattern, label in _DIGNITY_CAREER_RULES:
        if pattern in career:
            return label
    return career


dignity['Career'] = dignity['Career'].map(_normalize_dignity_career)

In [None]:
# Frequency of each normalised career label in the Dignity data
dignity['Career'].value_counts()

In [None]:
# Normalise free-text profile locations in labour['Location'] to a state /
# country / 'Everywhere' label.
#
# Each rule is (regex pattern, label). Rules are applied in order; every row
# whose current Location matches the pattern is overwritten with the label.
# Patterns are case-sensitive and, except where anchored, match anywhere in
# the string, so short ones ('Port', 'NG', 'here', ...) are intentionally broad.
# - `na=False` keeps rows with a missing location out of the mask, matching
#   how the Career cells call str.contains.
# - 'Aba' is matched as a whole word so that 'Abakaliki' and 'Abraka' reach
#   their own rules instead of being labelled 'Abia'.
# - The Chinese-language entry spells out Manhattan Chinatown, New York, USA,
#   so it maps to 'USA'.
_LABOUR_LOCATION_RULES = [
    ('Lagos', 'Lagos'),
    ('lagos', 'Lagos'),
    ('lekki', 'Lagos'),
    ('Abuja', 'Abuja'),
    ('fct', 'Abuja'),
    ('Federal', 'Abuja'),
    ('abuja', 'Abuja'),
    ('Ibadan', 'Oyo'),
    ('Oyo', 'Oyo'),
    ('Akwa', 'Akwa-Ibom'),
    ('Osun', 'Osun'),
    ('Bonny', 'Rivers'),
    ('Lekki', 'Lagos'),
    ('Ikorodu', 'Lagos'),
    ('Ilorin', 'Kwara'),
    ('Rumuigbo', 'Rivers'),
    ('Lawanson', 'Lagos'),
    ('Asaba', 'Delta'),
    ('Uyo', 'Akwa-Ibom'),
    ('Warri', 'Delta'),
    ('Lasgidi', 'Lagos'),
    ('Ikeja', 'Lagos'),
    ('Abeokuta', 'Ogun'),
    ('Lokoja', 'Kogi'),
    (r'\bAba\b', 'Abia'),
    ('Akure', 'Ondo'),
    ('Port', 'Rivers'),
    ('Abakaliki', 'Ebonyi'),
    ('Enugu', 'Enugu'),
    ('Awka', 'Anambra'),
    ('Jos', 'Plateau'),
    ('Abraka', 'Delta'),
    ('Onitsha', 'Anambra'),
    ('Owerri', 'Imo'),
    ('ibadan', 'Oyo'),
    ('port', 'Rivers'),
    ('Ondo', 'Ondo'),
    ('Victoria', 'Lagos'),
    ('Ghana', 'Ghana'),
    ('USA', 'USA'),
    ('World', 'Everywhere'),
    ('Earth', 'Everywhere'),
    ('Global', 'Everywhere'),
    ('NG', 'Nigeria'),
    ('🌍', 'Everywhere'),
    ('美国纽约州纽约曼哈顿华埠', 'USA'),
    ('Ado Ekiti', 'Ekiti'),
    ('Yola', 'Adamawa'),
    ('worldwide', 'Everywhere'),
    ('Kaduna', 'Kaduna'),
    ('Makurdi', 'Benue'),
    ('Here', 'Everywhere'),
    ('Near You', 'Everywhere'),
    ('Houston', 'USA'),
    ('Toronto', 'Canada'),
    ('Somewhere', 'Everywhere'),
    ('Yenagoa', 'Bayelsa'),
    ('Benin', 'Edo'),
    ('here', 'Everywhere'),
    ('Atlanta', 'USA'),
    ('Guangzhou', 'China'),
    ('Chicago', 'USA'),
    ('LAGOS', 'Lagos'),
    ('Calabar', 'Cross River'),
    ('Osogbo', 'Osun'),
    ('Umuahia', 'Abia'),
    ('Anambra', 'Anambra'),
    ('PORT', 'Rivers'),
    ('Imo', 'Imo'),
    ('yaba', 'Lagos'),
    ('owerri', 'Imo'),
    ('FCT, ABUJA', 'Abuja'),
    ('Adamawa', 'Adamawa'),
    ('Badagry', 'Lagos'),
    ('enugu', 'Enugu'),
    ('oyo', 'Oyo'),
    ('France', 'France'),
    ('Montréal', 'Canada'),
]

for _pattern, _place in _LABOUR_LOCATION_RULES:
    _matches = labour['Location'].str.contains(_pattern, na=False)
    labour.loc[_matches, 'Location'] = _place

In [None]:
# Frequency of each normalised location label in the Labour data
labour['Location'].value_counts()

In [None]:
# Normalise free-text user locations in the dignity data to a preferred
# state / country label.
#
# Each rule is a case-sensitive literal substring and the label that replaces
# the WHOLE cell when the substring is found. Rules run top to bottom, so a
# later rule only sees what the earlier rules left behind: a more specific
# pattern must therefore come before any shorter pattern it contains.
dignity_location_rules = [
    ('Lagos', 'Lagos'),
    ('lagos', 'Lagos'),
    ('lekki', 'Lagos'),
    ('Abuja', 'Abuja'),
    ('fct', 'Abuja'),
    ('Federal', 'Abuja'),
    ('abuja', 'Abuja'),
    ('Ibadan', 'Oyo'),
    ('Oyo', 'Oyo'),
    ('Akwa', 'Akwa-Ibom'),
    ('Osun', 'Osun'),
    ('Bonny', 'Rivers'),
    ('Lekki', 'Lagos'),
    ('Ikorodu', 'Lagos'),
    ('Ilorin', 'Kwara'),
    ('Rumuigbo', 'Rivers'),
    ('Lawanson', 'Lagos'),
    ('Asaba', 'Delta'),
    ('Uyo', 'Akwa-Ibom'),
    ('Warri', 'Delta'),
    ('Lasgidi', 'Lagos'),
    ('Ikeja', 'Lagos'),
    ('Abeokuta', 'Ogun'),
    ('Lokoja', 'Kogi'),
    # 'Abakaliki' has to be handled before 'Aba': otherwise the shorter
    # pattern matches first and Abakaliki rows end up labelled 'Abia'.
    ('Abakaliki', 'Ebonyi'),
    ('Aba', 'Abia'),
    ('Akure', 'Ondo'),
    ('Port', 'Rivers'),
    ('Enugu', 'Enugu'),
    ('Awka', 'Anambra'),
    ('Jos', 'Plateau'),
    ('Abraka', 'Delta'),
    ('Onitsha', 'Anambra'),
    ('Owerri', 'Imo'),
    ('ibadan', 'Oyo'),
    ('port', 'Rivers'),
    ('Ondo', 'Ondo'),
    ('Victoria', 'Lagos'),
    ('Ghana', 'Ghana'),
    ('USA', 'USA'),
    ('World', 'Everywhere'),
    ('Earth', 'Everywhere'),
    ('Global', 'Everywhere'),
    ('NG', 'Nigeria'),
    ('🌍', 'Everywhere'),
    # NOTE(review): this string appears to read "Manhattan Chinatown, New York,
    # USA" - confirm that 'China' is really the intended label.
    ('美国纽约州纽约曼哈顿华埠', 'China'),
    ('Ado Ekiti', 'Ekiti'),
    ('Yola', 'Adamawa'),
    ('worldwide', 'Everywhere'),
    ('Kaduna', 'Kaduna'),
    ('Makurdi', 'Benue'),
    ('Here', 'Everywhere'),
    ('Near You', 'Everywhere'),
    ('Houston', 'USA'),
    ('Toronto', 'Canada'),
    ('Somewhere', 'Everywhere'),
    ('Yenagoa', 'Bayelsa'),
    ('Benin', 'Edo'),
    ('here', 'Everywhere'),
    ('Atlanta', 'USA'),
    ('Guangzhou', 'China'),
    ('Chicago', 'USA'),
    ('LAGOS', 'Lagos'),
    ('Calabar', 'Cross River'),
    ('Osogbo', 'Osun'),
    ('Umuahia', 'Abia'),
    ('Anambra', 'Anambra'),
    ('PORT', 'Rivers'),
    ('Imo', 'Imo'),
    ('yaba', 'Lagos'),
    ('owerri', 'Imo'),
    ('FCT, ABUJA', 'Abuja'),
    ('Adamawa', 'Adamawa'),
    ('Badagry', 'Lagos'),
    ('enugu', 'Enugu'),
    ('oyo', 'Oyo'),
    ('France', 'France'),
    ('Montréal', 'Canada'),
]

for pattern, label in dignity_location_rules:
    # regex=False: every pattern is plain text, not a regular expression.
    # na=False: a missing location never matches instead of breaking the mask.
    matches = dignity['Location'].str.contains(pattern, regex=False, na=False)
    dignity.loc[matches, 'Location'] = label

In [None]:
# Count how many dignity tweets carry each location label
dignity['Location'].value_counts()

In [None]:
# Treat empty or whitespace-only locations in the labour data as missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0+.
labour['Location'] = labour['Location'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Re-check the labour location counts (value_counts leaves missing values out by default)
labour['Location'].value_counts()

In [None]:
# Treat empty or whitespace-only locations in the dignity data as missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0+.
dignity['Location'] = dignity['Location'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Re-check the dignity location counts (value_counts leaves missing values out by default)
dignity['Location'].value_counts()

## Further Cleaning Process

In [None]:
# Derive separate Date, Time, Year, Month, Day and Hour columns from 'Date Created'
labour['Date'] = labour['Date Created'].dt.date
labour['Time'] = labour['Date Created'].dt.time
for part in ('year', 'month', 'day', 'hour'):
    # 'year' -> column 'Year', 'month' -> column 'Month', and so on
    labour[part.title()] = getattr(labour['Date Created'].dt, part)

In [None]:
# Derive separate Date, Time, Year, Month, Day and Hour columns from 'Date Created'
dignity['Date'] = dignity['Date Created'].dt.date
dignity['Time'] = dignity['Date Created'].dt.time
for part in ('year', 'month', 'day', 'hour'):
    # 'year' -> column 'Year', 'month' -> column 'Month', and so on
    dignity[part.title()] = getattr(dignity['Date Created'].dt, part)

In [None]:
# Overwrite the numeric Month column with the month's name
labour['Month'] = labour['Date Created'].dt.month_name()

In [None]:
# Display the labour dataframe
labour

In [None]:
# Show the dtype of the Hour column
labour['Hour'].dtype

In [None]:
# Overwrite the numeric Month column with the month's name
dignity['Month'] = dignity['Date Created'].dt.month_name()

In [None]:
# Display the dignity dataframe
dignity

In [None]:
# Convert every Career value to title case (upper-cased first, then title-cased)
career_upper = labour['Career'].str.upper()
labour['Career'] = career_upper.str.title()

In [None]:
# Convert every Career value to title case (upper-cased first, then title-cased)
career_upper = dignity['Career'].str.upper()
dignity['Career'] = career_upper.str.title()

## Getting Latitudes and Longitudes

In [None]:
# Missing coordinates become the placeholder text "No Location"
labour['Coordinates'] = labour['Coordinates'].fillna(value='No Location')

In [None]:
# Missing coordinates become the placeholder text "No Location"
dignity['Coordinates'] = dignity['Coordinates'].fillna(value='No Location')

In [None]:
# Break the labour coordinates text into whitespace-separated pieces.
# The split has to produce exactly four columns, otherwise the column
# assignment below raises; only 'Longitude' and 'Latitude' are named for later use.
coordinate_piece_names = ['a', 'Longitude', 'c', 'Latitude']
labour['Geo'] = labour['Coordinates'].astype('string')
a = (
    labour['Geo']
    .astype(str)
    .str.split(expand=True)
)
a.columns = coordinate_piece_names

In [None]:
# Break the dignity coordinates text into whitespace-separated pieces.
# The split has to produce exactly four columns, otherwise the column
# assignment below raises; only 'Longitude' and 'Latitude' are named for later use.
coordinate_piece_names = ['a', 'Longitude', 'c', 'Latitude']
dignity['Geo'] = dignity['Coordinates'].astype('string')
b = (
    dignity['Geo']
    .astype(str)
    .str.split(expand=True)
)
b.columns = coordinate_piece_names

In [None]:
# Append the split coordinate pieces to the labour frame as extra columns
labour = pd.concat([labour, a], axis='columns')

In [None]:
# Append the split coordinate pieces to the dignity frame as extra columns
dignity = pd.concat([dignity, b], axis='columns')

In [None]:
# List the labour column names
labour.columns

In [None]:
# Remove the raw timestamp, the processed-tweet text and the coordinate helper columns
cols = ['Date Created', 'Processed_Tweets', 'Geo', 'a', 'c']
labour = labour.drop(columns=cols)

In [None]:
# List the dignity column names
dignity.columns

In [None]:
# Remove the raw timestamp, the processed-tweet text and the coordinate helper columns
cols = ['Date Created', 'Processed_Tweets', 'Geo', 'a', 'c']
dignity = dignity.drop(columns=cols)

In [None]:
# A Longitude of exactly 'Location' is placeholder text, not a coordinate: mark it missing
labour['Longitude'] = labour['Longitude'].replace('Location', np.nan)

In [None]:
# Keep only the part of each labour Longitude value that precedes the first ','
longitude_pieces = labour['Longitude'].str.split(',', n=1, expand=True)
labour['Longitude'] = longitude_pieces[0]

In [None]:
# Keep only the part of each labour Latitude value that precedes the first '}'
latitude_pieces = labour['Latitude'].str.split('}', n=1, expand=True)
labour['Latitude'] = latitude_pieces[0]

In [None]:
# A Longitude of exactly 'Location' is placeholder text, not a coordinate: mark it missing
dignity['Longitude'] = dignity['Longitude'].replace('Location', np.nan)

In [None]:
# Keep only the part of each dignity Longitude value that precedes the first ','
longitude_pieces = dignity['Longitude'].str.split(',', n=1, expand=True)
dignity['Longitude'] = longitude_pieces[0]

In [None]:
# Keep only the part of each dignity Latitude value that precedes the first '}'
latitude_pieces = dignity['Latitude'].str.split('}', n=1, expand=True)
dignity['Latitude'] = latitude_pieces[0]

In [None]:
# Count how often each cleaned longitude value occurs in the dignity data
dignity['Longitude'].value_counts()

## Sentiment Analysis

In [None]:
# Shared resources for cleaning the tweets in the labour and dignity dataframes
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

def clean(text):
    """Normalise one tweet into a lower-case, stemmed, stopword-free string.

    Steps, in order: lower-case; drop ``[...]`` spans, URLs and ``<...>`` tags;
    strip ASCII punctuation; turn line breaks into spaces; drop any token that
    contains a digit; remove English stopwords; stem what is left.

    Parameters
    ----------
    text : object
        The tweet. It is passed through ``str()`` first, so non-string values
        (e.g. NaN) are cleaned as their string form rather than raising.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # NOTE(review): stemmed tokens may no longer match the word lists used by
    # the sentiment scorers applied to this column later - confirm stemming is wanted.
    text = str(text).lower()
    # Raw-string patterns: '\[' / '\S' / '\w' / '\d' are invalid escapes in a
    # normal string literal and trigger warnings on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A line break separates words, so replace it with a space instead of
    # deleting it (deleting would fuse "good\nmorning" into "goodmorning").
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument splits on any run of whitespace, so the removals
    # above cannot leave empty tokens behind.
    kept = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(kept)

In [None]:
# Replace each labour tweet with its cleaned form
labour["Tweet"] = labour["Tweet"].map(clean)

In [None]:
# Replace each dignity tweet with its cleaned form
dignity["Tweet"] = dignity["Tweet"].map(clean)

In [None]:
# Score every labour tweet with VADER and store the pos / neg / neu values
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
# Run the analyser once per tweet; all three columns are read from that one
# result dict instead of re-scoring the same text three times.
labour_scores = [sentiments.polarity_scores(tweet) for tweet in labour["Tweet"]]
labour["Positive"] = [score["pos"] for score in labour_scores]
labour["Negative"] = [score["neg"] for score in labour_scores]
labour["Neutral"] = [score["neu"] for score in labour_scores]

In [None]:
# Display the labour dataframe
labour

In [None]:
# Score every dignity tweet with VADER and store the pos / neg / neu values
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
# Run the analyser once per tweet; all three columns are read from that one
# result dict instead of re-scoring the same text three times.
dignity_scores = [sentiments.polarity_scores(tweet) for tweet in dignity["Tweet"]]
dignity["Positive"] = [score["pos"] for score in dignity_scores]
dignity["Negative"] = [score["neg"] for score in dignity_scores]
dignity["Neutral"] = [score["neu"] for score in dignity_scores]

In [None]:
# Display the dignity dataframe
dignity

In [None]:
# Subjectivity score of a single tweet
def getSubjectivity(tweet):
    """Return the subjectivity component of TextBlob's sentiment for ``tweet``."""
    blob_sentiment = TextBlob(tweet).sentiment
    return blob_sentiment.subjectivity

# Polarity score of a single tweet
def getPolarity(tweet):
    """Return the polarity component of TextBlob's sentiment for ``tweet``."""
    blob_sentiment = TextBlob(tweet).sentiment
    return blob_sentiment.polarity

# Map a polarity score to its sentiment category
def getSentimentTextBlob(polarity):
    """Label a polarity score.

    Returns "Negative" for a score below zero, "Neutral" for a score equal to
    zero, and "Positive" for anything else.
    """
    if polarity < 0:
        return "Negative"
    if polarity == 0:
        return "Neutral"
    return "Positive"

In [None]:
# Add TextBlob subjectivity and polarity for each labour tweet, then label the polarity
labour['Subjectivity'] = labour['Tweet'].map(getSubjectivity)
labour['Polarity'] = labour['Tweet'].map(getPolarity)
labour['Sentiment'] = labour['Polarity'].map(getSentimentTextBlob)

In [None]:
# Add TextBlob subjectivity and polarity for each dignity tweet, then label the polarity
dignity['Subjectivity'] = dignity['Tweet'].map(getSubjectivity)
dignity['Polarity'] = dignity['Tweet'].map(getPolarity)
dignity['Sentiment'] = dignity['Polarity'].map(getSentimentTextBlob)

In [None]:
# Count how many labour tweets fall into each Sentiment category
labour['Sentiment'].value_counts()

In [None]:
# Count how many dignity tweets fall into each Sentiment category
dignity['Sentiment'].value_counts()

In [None]:
# Save the processed labour data as UTF-8 CSV, without the index column
labour.to_csv('labourfinal.csv', index=False, encoding='utf-8')

In [None]:
# Save the processed dignity data as UTF-8 CSV, without the index column
dignity.to_csv('dignityfinal.csv', index=False, encoding='utf-8')

In [None]:
# Show the (rows, columns) size of the labour data
labour.shape

In [None]:
# Show the (rows, columns) size of the dignity data
dignity.shape

## Concatenate all data to one final variable

In [None]:
# Stack the labour rows on top of the dignity rows and renumber the index from 0
lab_dig = pd.concat([labour, dignity], ignore_index=True)

In [None]:
# Show the (rows, columns) size of the combined data
lab_dig.shape

In [None]:
# Display the combined dataframe
lab_dig

In [None]:
# Count how many tweets in the combined data fall into each Sentiment category
lab_dig['Sentiment'].value_counts()

## Write to CSV for Visualization

In [None]:
# Save the combined data as UTF-8 CSV, without the index column
lab_dig.to_csv('labour_dignity_final.csv', index=False, encoding='utf-8')