## Import the Dataset

In [1]:
import pandas as pd
import os
from fnmatch import fnmatch

In [2]:
rootdir = os.path.join(os.getcwd(),os.pardir)
annotated_dir = os.path.join(rootdir,"data/annotations")
pattern = "*.csv"

In [3]:
# annotated files were generated in batches of 500 
# combine them together and add to training set
gen = [file for file in os.listdir(annotated_dir) if fnmatch(file,pattern)]
train_df = pd.DataFrame()
for file in gen:
    filepath=os.path.join(annotated_dir,file)
    print(file)
    annotated_df = pd.read_csv(filepath, index_col=0)
    train_df = pd.concat([train_df, annotated_df])
train_df.head()

annotated08.csv
annotated10.csv
annotated03.csv
annotated09.csv
annotated01.csv
annotated02.csv
annotated04.csv
annotated11.csv
annotated05.csv
annotated06.csv
annotated12.csv
annotated07.csv


Unnamed: 0,Q0,Q1,Q2,Q3,created_at,text
590748,YES,UNKNOWN,AFD,UNKNOWN,2017-07-03 08:57:48,RT FSchaerdel: Das wird eine kurze Entscheidun...
450785,YES,UNKNOWN,AFD,UNKNOWN,2017-06-30 18:32:08,RT AfD_Bund: Dr. Alexander Gauland: »Ehefueral...
576616,YES,UNKNOWN,AFD,UNKNOWN,2017-07-02 22:59:14,RT AfD_SOK: Pressemitteilung des Bundestagskan...
454156,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 19:11:03,RT erzaehlmirnix: AfD-Logik: LINK LINK
377905,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 08:56:10,JungeFreiheit Dramatischer Einstellungswandel ...


In [4]:
#quickly take a look at the structure to understand the columns available to us
train_df.columns

Index(['Q0', 'Q1', 'Q2', 'Q3', 'created_at', 'text'], dtype='object')

In [5]:
# 
nrows = None # if you want the whole data
#nrows = 10000 # 10K is fast to load but still varied enough
test_df = pd.read_csv( os.path.join( rootdir,
                                    "data",
                                    "processed",
                                    "tweets.csv"),
                      nrows=nrows)
test_df.set_index('id',inplace=True)

In [6]:
#print out first few lines of dataframe
# be on the lookout for anything that seems out of place
test_df.head()

Unnamed: 0_level_0,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
875063024486293504,2017-06-14 18:50:57,Spoke to the young leaders of the 18th Annual ...
875063094220861440,2017-06-14 18:51:14,RT @ABCNews4: Chris Singleton of CSU drafted b...
875063277520388096,2017-06-14 18:51:57,RT @Rubysayzz: CORRUPT SYSTEM PRODUCES & SUSTA...
875063305748054016,2017-06-14 18:52:04,RT @Irelandbrexit: Merkel justifies her migran...
875063528335474688,2017-06-14 18:52:57,RT @Rubysayzz: CORRUPT SYSTEM PRODUCES & SUSTA...


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2021752 entries, 875063024486293504 to 915725234233159680
Data columns (total 2 columns):
created_at    object
text          object
dtypes: object(2)
memory usage: 46.3+ MB


## Remove Null values from training data

In [8]:
# NULL or NA on text column means that there was an error with the format, 
#    as each line is supposed to have one tweet!!
# NULL or NA on Q0 means that this tweet hasn't been annotated by mistake. 
#    Or that this tweet hasn't been annotated yet. 
#    This enabled us to work with partially annotated files. 
train_df.dropna(subset=['text','Q0'], inplace=True)

In [9]:
# counting the number of entries and looking at cumulative NULLness
# Q0 is completely NULL free
# Q1,Q2,Q3 were left empty for the case where the tweets were judged to be NOT RELEVANT
train_df.info()
# note the Int64Index -- pandas supports 64 bit integers 
#    - we do not have to covert the indices to strings like R

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5092 entries, 590748 to 508736
Data columns (total 6 columns):
Q0            5092 non-null object
Q1            4652 non-null object
Q2            4652 non-null object
Q3            4652 non-null object
created_at    4094 non-null object
text          5092 non-null object
dtypes: object(6)
memory usage: 278.5+ KB


In [10]:
#  Some dates appear to be missing - warrants some investigation!!
missing_df = train_df[train_df.created_at.isna()]
# Seems like we have 998 entries (maybe from the same annotation set) with missing dates
# Can we go into the test set and repopulate the date values?

In [11]:
#turns out that there was an issue with the index. 
# the index of missingdf corresponds to twitter ids 
#   -- which we somehow dropped for the other tweets - but retained for the 998 in missing_df

(test_df
 .loc[missing_df.index,:]
).head()

Unnamed: 0,created_at,text
902112185685565441,2017-08-28 10:14:40,Angela Merkel insists she has no regrets over ...
902612931560574977,2017-08-29 19:24:27,Lol made a friend my first day at csu & now sh...
902593242390286337,2017-08-29 18:06:13,@Model3Owners and yet Merkel said combustion e...
902365247075479553,2017-08-29 03:00:14,@salmaankhwaja Partly Cloudy in #Karachi with ...
902062462710812672,2017-08-28 06:57:05,#GermanyDecids🇩🇪: Too many think pieces on Mer...


In [12]:
#putting everything together
train_df.loc[missing_df.index,'created_at'] = test_df.loc[missing_df.index,'created_at']
train_df.info()
#we fixed the mismatch

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5092 entries, 590748 to 508736
Data columns (total 6 columns):
Q0            5092 non-null object
Q1            4652 non-null object
Q2            4652 non-null object
Q3            4652 non-null object
created_at    5092 non-null object
text          5092 non-null object
dtypes: object(6)
memory usage: 438.5+ KB


In [13]:
#nothing out of the ordinary here
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2021752 entries, 875063024486293504 to 915725234233159680
Data columns (total 2 columns):
created_at    object
text          object
dtypes: object(2)
memory usage: 126.3+ MB


In [14]:
#test_df = test_df.loc[test_df['lang'] == 'en']
#en_test_df = test_df[['created_at', 'text']]
#print(en_test_df.shape)
#en_test_df.head()
## this was the earlier code to filter out english tweets
##  -- but the process was eating up the RAM 
##  -- now this is done by `src/data/make_dataset.py` using the csv module

In [15]:
#time_df = pd.DataFrame(test_df[['created_at']])
#time_df = time_df.reset_index(drop=True)
#time_df.head()
## this column of timestamps is extracted and kept aside. 
## Later the cleaned tweets shall be combined with this and a dataframe created 
## not needed anymore due to df.apply

## Data Cleaning

In [16]:
# does two things 
# 1. substitutes undesirable entities like mentions and links with blanks
#   a. remove @mentions
#   b. remove protocol://links.com
#   c. remove special characters like !@#$%^&*()
# 2. replaces multiple spaces by a single space
import re
def clean_tweet(tweet):
        '''
        Utility function to clean tweet text by removing links, special characters
        using simple regex statements.
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

In [17]:
# loops over rows of dataframe to create a list of dictionaries called parsed_tweets.
# the list of dictionaries is easy to form a new dataframe from
# however, using apply sidesteps this complex process into a one-liner


# def parse_data(tweets):
#     parsed_tweets = []
#     cnt = 0
#     for tweet in tweets:
#         cnt += 1
#         # empty dictionary to store required params of a tweets
#         parsed_tweet = {}
#         # cleaning text of tweet
#         parsed_tweet['text'] = clean_tweet(tweet)
#         parsed_tweets.append(parsed_tweet)
#     print ("Cleaned data for %d tweet" % cnt)
#     return parsed_tweets

In [18]:
# new column is created with clean tweets
# clean_test_df = pd.DataFrame(parse_data(test_df['text'].tolist()))

In [19]:
# join two columns into one dataframe
# axis = 1 means that columns are being added
# axis =0, which is the default, adds new rows
# clean_test_df = pd.concat([time_df, clean_test_df], axis=1)

In [20]:
# clean_train_df = pd.DataFrame(parse_data(train_df['text'].tolist()))
# same process of creating columns and combining them
#   into a dataframe is repeated for the training data

In [21]:
# selected_train_df = train_df[['Q0', 'Q1']].reset_index(drop=True)
# creates a new index and demotes the older index to yet another lowly column
# drops the entries if all other columns were NA

In [22]:
# clean_train_df = pd.concat([clean_train_df, selected_train_df], axis=1)

In [23]:
# clean_train_df.head()

In [24]:
# everything since clean_tweets was defined can be done in one line like so-- 
test_df.head().text.apply(clean_tweet)
# this is why the code above is commented out
# TIP: use ctrl+/ to comment out code

id
875063024486293504    Spoke to the young leaders of the 18th Annual ...
875063094220861440    RT Chris Singleton of CSU drafted by Chicago C...
875063277520388096    RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
875063305748054016    RT Merkel justifies her migrant policy by sayi...
875063528335474688    RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
Name: text, dtype: object

In [25]:
# putting everything together
clean_train_df = train_df
clean_train_df['text'] = train_df.text.apply(clean_tweet)
clean_train_df.head()

Unnamed: 0,Q0,Q1,Q2,Q3,created_at,text
590748,YES,UNKNOWN,AFD,UNKNOWN,2017-07-03 08:57:48,RT FSchaerdel Das wird eine kurze Entscheidung...
450785,YES,UNKNOWN,AFD,UNKNOWN,2017-06-30 18:32:08,RT AfD Bund Dr Alexander Gauland Ehefueralle E...
576616,YES,UNKNOWN,AFD,UNKNOWN,2017-07-02 22:59:14,RT AfD SOK Pressemitteilung des Bundestagskand...
454156,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 19:11:03,RT erzaehlmirnix AfD Logik LINK LINK
377905,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 08:56:10,JungeFreiheit Dramatischer Einstellungswandel ...


In [26]:
#similarly for the test set 
clean_test_df = test_df
clean_test_df['text'] = test_df.text.apply(clean_tweet)
clean_test_df.head()

Unnamed: 0_level_0,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
875063024486293504,2017-06-14 18:50:57,Spoke to the young leaders of the 18th Annual ...
875063094220861440,2017-06-14 18:51:14,RT Chris Singleton of CSU drafted by Chicago C...
875063277520388096,2017-06-14 18:51:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
875063305748054016,2017-06-14 18:52:04,RT Merkel justifies her migrant policy by sayi...
875063528335474688,2017-06-14 18:52:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...


## Machine Learning to predict Q0 and Q1 for Test Dataset

In [27]:
# this is not required anymore
# in an earlier version the column names were inconsistent
column_dict = {'Q0': 'Q0_RELEVANT',
               'Q1': 'Q1_mood_of_speaker'}

In [28]:
# try to guess what these functions do
# it is very important to understand what these do

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# takes in the training and testing dataframes 
# uses the trained model to make predictions
# here we use Random Forest on the annotated tweets
def train_and_predict(df_train, df_test, column="Q0",verbose=True):
    cv = TfidfVectorizer(min_df=1, stop_words='english')
    
    mask = (df_train["Q0"] == "YES")
    if column== "Q0":
        df_train_ = df_train
    else:
        mask = df_train["Q0"] == "YES"
        df_train_ = df_train[mask]
        
    tweet_train = cv.fit_transform(df_train_['text'])
    label_train = df_train_[column]
    classifier = RandomForestClassifier(50, random_state=4, verbose=verbose)
    classifier.fit(tweet_train, label_train)
    
    tweet_test = cv.transform(df_test["text"])
    return classifier.predict(tweet_test)

In [29]:
#include columns from original dataset
data = {"text":clean_test_df["text"],
        "created_at":pd.to_datetime(clean_test_df["created_at"])
       }

for column in column_dict.keys():
    print("->>{}".format(column))
    data[column] = train_and_predict(df_train = clean_train_df, df_test = clean_test_df, column = column)

->>Q0


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   24.2s finished


->>Q1


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   23.6s finished


In [30]:
# if tweet is not relevant 
# -- ignore the Q1 prediction by using "NULL" as the value

df_predictions = pd.DataFrame(data)
#for i, row in df_predictions.iterrows():
#    if row["Q0"]=="NO" :
#        df_predictions.at[i,"Q1"] = "NULL"

#replacing the "for loop" with the vectorised "ufunc" should yield faster execution
not_relevant_indices = df_predictions.query("Q0=='NO'").index
df_predictions.loc[not_relevant_indices,"Q1"] = 'NULL'
df_predictions.head()

Unnamed: 0_level_0,Q0,Q1,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
875063024486293504,NO,,2017-06-14 18:50:57,Spoke to the young leaders of the 18th Annual ...
875063094220861440,NO,,2017-06-14 18:51:14,RT Chris Singleton of CSU drafted by Chicago C...
875063277520388096,NO,,2017-06-14 18:51:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
875063305748054016,YES,UNKNOWN,2017-06-14 18:52:04,RT Merkel justifies her migrant policy by sayi...
875063528335474688,NO,,2017-06-14 18:52:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...


In [31]:
from IPython.core.display import HTML, display
def print_html(input):
    return display(HTML(input))

In [32]:
print_html("<h2>Value counts in model predictions</h2>")
for col in ["Q0","Q1"]:
    print_html("<h3>Value Counts for column \"{}\"</h3>".format(col))
    display(df_predictions[col].value_counts().to_frame())
# observations
# there is an overwhelming majority of relevant tweets - more than 85%
# note that NULL values in Q1 only happen if  Q0 is NO -- hence their counts match
#
# looking at the values of Q1 
# -- we observe that the annotation data is largely biased towards unknown
# thus we would do well to drop "Speaker's Mood" from our analysis

Unnamed: 0,Q0
YES,1634248
NO,387504


Unnamed: 0,Q1
UNKNOWN,1630568
,387504
POSITIVE,2018
NEGATIVE,1658
BOTH/MIXED,4


__Write the predicted dataset to csv file for further use__

In [33]:
import csv 
df_predictions.to_csv(os.path.join(rootdir,
                                   "data",
                                   "interim",
                                   "predictions.csv"),
                      index=False,
                      quoting=csv.QUOTE_ALL,
                      quotechar='"',
                     )