# CZ4034 Information Retrieval - Group 17

## Crawling Python Notebook (with Data Cleaning)

### This notebook contains the main code used for Crawling/Scraping Twitter using SNScrape, with some of the Data Cleaning code.

In [None]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import time

In [None]:
# Country -> main city used as the geographic anchor of the Twitter search.
# The city is usually, but not always, the capital (e.g. China maps to Hong Kong).
# Insertion order is the order in which the countries get scraped.
countriesDict = {
    "Iran": "Tehran",
    "Israel": "Jerusalem",
    "Saudi Arabia": "Riyadh",
    "China": "Hong Kong",
    "Ukraine": "Kyiv",
    "Russia": "Moscow",
    "UK": "London",
    "India": "New Delhi",
    "Mexico": "Mexico City",
    "Canada": "Ottawa",
    "Brazil": "Brasilia",
    "South Korea": "Seoul",
    "Philippines": "Manila",
    "Kenya": "Nairobi",
    "Nigeria": "Abuja",
    "Germany": "Berlin",
    "Taiwan": "Taipei",
    "France": "Paris",
    "Afghanistan": "Kabul",
    "Indonesia": "Jakarta",
    "Japan": "Tokyo",
    "Australia": "Canberra",
    "Singapore": "Singapore",
}

# Upper bound on the number of tweets fetched per search keyword.
num_tweets_per_tag = 5000

In [None]:
# Search keywords (hashtags and plain phrases), grouped by stance.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, so a missing comma at the end of a group merges its
# last keyword with the first keyword of the next group.
keywords = [
    # Mentions of Trump: 7
    "#Trump", "#trump", "#Trump2020", "#DonaldTrump", "DonaldJTrump", "Donald Trump", "Trump",

    # Pro-Trump: 8
    "#VoteTrump", "VoteRed", "#MAGA", "#PresidentTrump", "#MakeAmericaGreatAgain", "#TeamTrump", "#DrainTheSwamp", "#MyPresident",

    # Anti-Trump: 7
    "#VoteTrumpOut", "#DumpTrump", "#TrumpIsPathetic", "#TrumpCorruption", "#VoteHimOut", "#YoureFiredTrump", "#TrumpHasToGo",

    # Mentions of Biden: 6
    "#Biden", "#biden", "#Biden2020", "Joe Biden", "#JoeBiden", "Biden",

    # Pro-Biden: 6
    "#VoteBiden", "VoteBlue", "#VoteBlueToSaveAmerica", "#BlueWave2020", "#TeamBiden", "#JoeMentum",

    # Anti-Biden: 7
    "Sleepy Joe", "#SleepyJoe", "HidenBiden", "#CreepyJoeBiden", "#NeverBiden", "#BidenUkraineScandal", "#HunterBiden",

    # Miscellaneous: 1
    "#USElections",
]

In [None]:
# This is the main method used to scrape Twitter data (tweets) using SNScrape
def scrape_data(countryName, countriesDict=countriesDict, withinRange = 1000, num_tweets_per_tag=num_tweets_per_tag,
                since="2020-09-01", until="2020-12-31"):
    """Scrape English tweets near a country's main city for every search keyword.

    One search is issued per entry of the module-level ``keywords`` list. The
    per-keyword results are combined, reduced to a fixed set of columns, written
    to ``snscrape_<countryName>.csv`` and returned.

    Parameters
    ----------
    countryName : str
        Key of ``countriesDict``; also stored in the ``country`` column and used
        in the output file name.
    countriesDict : dict
        Mapping of country name to the city used in the ``near:`` search operator.
    withinRange : int
        Search radius in kilometres, used in the ``within:`` search operator.
    num_tweets_per_tag : int
        Maximum number of tweets taken from each keyword search.
    since, until : str
        Date bounds inserted into the ``since:`` / ``until:`` search operators.

    Returns
    -------
    pandas.DataFrame
        Columns ``username, content, date, country, replyCount, retweetCount,
        likeCount, url``. Empty (but with those columns) when nothing was found.

    Raises
    ------
    KeyError
        If ``countryName`` is not a key of ``countriesDict``.
    """
    output_columns = ["username", "content", "date", "country", "replyCount", "retweetCount", "likeCount", "url"]
    start = time.time()

    # Resolve the city once, outside the try-block, so that an unknown country
    # fails immediately instead of being reported as a scraping error per keyword.
    city = countriesDict[countryName]

    # Collect one frame per keyword and concatenate once at the end.
    # (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # inside a loop copies all previously collected rows on every iteration.)
    frames = []
    for word in keywords:
        query = f'{word} near:"{city}" within:{withinRange}km lang:en since:{since} until:{until}'
        try:
            tweets = sntwitter.TwitterSearchScraper(query).get_items()
            frame = pd.DataFrame(itertools.islice(tweets, num_tweets_per_tag))
        except Exception as e:
            # Best effort: a failing keyword must not abort the whole country,
            # but the keyword and the cause are reported so failures are visible.
            print(f"An error occurred while scraping '{word}' for {countryName}: {e!r}\n")
            continue
        if not frame.empty:
            frames.append(frame)

    df = pd.concat(frames) if frames else pd.DataFrame()

    # 1000 is the sanity threshold below which the result is flagged as suspiciously small.
    if len(df) < 1000:
        print(f"Number of tweets for {countryName} is lower than expected! df shape: {df.shape}")

    if df.empty:
        # No tweets at all: there is no 'user' column to read, so build an empty
        # frame with the expected schema instead of raising KeyError.
        df_ = pd.DataFrame(columns=output_columns)
    else:
        # Each value of the 'user' column is indexed by key to extract the username.
        df['username'] = df['user'].apply(lambda x: x['username'])
        df['country'] = countryName
        df_ = df[output_columns]

    df_.to_csv(f'snscrape_{countryName}.csv', index = False)
    print(f"Shape of df for {countryName}: {df_.shape}, Time taken: {((time.time() - start)/60):.1f} mins")
    return df_

In [None]:
# Cache of scraped results: country name -> DataFrame of that country's tweets.
# Starts empty and is filled in as each of the 23 countries is scraped.
countriesDf = dict()

In [None]:
# Scrape every country of countriesDict that is not yet present in countriesDf,
# so re-running this cell only fetches the countries that are still missing.
# A few countries use their own search radius (in km); all others use 800 km.
radius_overrides = {'Russia': 1000, 'Mexico': 500, 'Canada': 100, 'Singapore': 50}

for country in countriesDict:
    if country not in countriesDf:
        withinRange = radius_overrides.get(country, 800)
        countriesDf[country] = scrape_data(country, withinRange=withinRange)

In [None]:
# Report how many tweets were collected for each scraped country.
for country in countriesDf:
    tweet_count = len(countriesDf[country])
    print(f"{country}: {tweet_count}")

In [None]:
# To create the main DataFrame of tweets
# All per-country frames are stacked with a single pd.concat call:
# DataFrame.append no longer exists in pandas >= 2.0, and appending in a loop
# re-copies every previously added row on each iteration.
country_frames = list(countriesDf.values())
# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
df = pd.concat(country_frames) if country_frames else pd.DataFrame()

print(df.shape)

In [None]:
# Cleaning Data
# Two rules are applied in a single pass over the rows of df:
#   1. A tweet is discarded when it has more hashtag tokens than ordinary tokens.
#   2. Of the remaining tweets, only one row per username is kept: the LAST such
#      row in df's current order.
#      NOTE(review): this is the user's most recent tweet only if df happens to be
#      ordered oldest-first; no sorting by 'date' is done here - verify.
user_dict = {}
for position, (text, author) in enumerate(zip(df["content"], df["username"])):
    tokens = text.split()
    hashtag_count = sum(1 for token in tokens if token.startswith('#'))
    # Equivalent to "hashtags > non-hashtags".
    if 2 * hashtag_count > len(tokens):
        continue
    # A later qualifying row of the same user overwrites the earlier position.
    user_dict[author] = position

# Positions are listed in the order in which each user was first kept.
df_indexes_v2 = list(user_dict.values())

df_v2 = df.iloc[df_indexes_v2]
print(f'Shape of df after cleaning: {df_v2.shape}')

In [None]:
# Shuffling tweets in version 2 of the dataframe, and saving to a CSV file
# The shuffle uses a fixed seed so that "Restart & Run All" reproduces the exact
# same row order in the exported file.
SHUFFLE_SEED = 42
# Rows with identical tweet text are dropped first (the first occurrence is kept).
df_v2 = df_v2.drop_duplicates(subset='content')
# frac=1 returns every row in random order; the old index is discarded afterwards.
df_v2 = df_v2.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print(df_v2.shape)
df_v2.to_csv("cz4034_scraped_data.csv", encoding = "utf-8-sig", index=False)

In [None]:
# Show which countries are still represented after cleaning.
unique_countries = df_v2['country'].unique()
print(unique_countries)

In [None]:
# Show, per country, the number of distinct tweet texts in the cleaned data.
tweets_per_country = df_v2.groupby('country')['content'].nunique()
print(tweets_per_country)