## Import the Dataset

In [1]:
import pandas as pd
import os
from fnmatch import fnmatch

In [2]:
rootdir = os.path.join(os.getcwd(),os.pardir)
annotated_dir = os.path.join(rootdir,"data/annotations")
pattern = "*.csv"

In [3]:
# annotated files were generated in batches of 500 
# combine them together and add to training set
gen = [file for file in os.listdir(annotated_dir) if fnmatch(file,pattern)]
train_df = pd.DataFrame()
for file in gen:
    filepath=os.path.join(annotated_dir,file)
    print(filepath)
    annotated_df = pd.read_csv(filepath, index_col=0)
    train_df = pd.concat([train_df, annotated_df])
train_df.head()

/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated08.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated10.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated03.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated09.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated01.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated02.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated04.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated11.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated05.csv
/home/debanjan/git-projects/sklearn-senti-refactor/notebooks/../data/annotations/annotated06.csv
/home/debanjan/git-projects/sk

Unnamed: 0,Q0,Q1,Q2,Q3,created_at,text
590748,YES,UNKNOWN,AFD,UNKNOWN,2017-07-03 08:57:48,RT FSchaerdel: Das wird eine kurze Entscheidun...
450785,YES,UNKNOWN,AFD,UNKNOWN,2017-06-30 18:32:08,RT AfD_Bund: Dr. Alexander Gauland: »Ehefueral...
576616,YES,UNKNOWN,AFD,UNKNOWN,2017-07-02 22:59:14,RT AfD_SOK: Pressemitteilung des Bundestagskan...
454156,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 19:11:03,RT erzaehlmirnix: AfD-Logik: LINK LINK
377905,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 08:56:10,JungeFreiheit Dramatischer Einstellungswandel ...


In [4]:
#quickly take a look at the structure to understand the columns available to us
train_df.columns

Index(['Q0', 'Q1', 'Q2', 'Q3', 'created_at', 'text'], dtype='object')

In [31]:
# 
nrows = None # if you want the whole data
#nrows = 10000 # 10K is fast to load but still varied enough
test_df = pd.read_csv( os.path.join( rootdir,
                                    "data",
                                    "processed",
                                    "tweets.csv"),
                      nrows=nrows)
test_df.set_index('id',inplace=True)

In [6]:
#print out first few lines of dataframe
# be on the lookout for anything that seems out of place
test_df.head()

Unnamed: 0,id,created_at,text
0,875063024486293504,2017-06-14 18:50:57,Spoke to the young leaders of the 18th Annual ...
1,875063094220861440,2017-06-14 18:51:14,RT @ABCNews4: Chris Singleton of CSU drafted b...
2,875063277520388096,2017-06-14 18:51:57,RT @Rubysayzz: CORRUPT SYSTEM PRODUCES & SUSTA...
3,875063305748054016,2017-06-14 18:52:04,RT @Irelandbrexit: Merkel justifies her migran...
4,875063528335474688,2017-06-14 18:52:57,RT @Rubysayzz: CORRUPT SYSTEM PRODUCES & SUSTA...


In [32]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2021752 entries, 0 to 2021751
Data columns (total 3 columns):
id            int64
created_at    object
text          object
dtypes: int64(1), object(2)
memory usage: 46.3+ MB


## Remove Null values from training data

In [7]:
# NULL or NA on text column means that there was an error with the format, 
#    as each line is supposed to have one tweet!!
# NULL or NA on Q0 means that this tweet hasn't been annotated by mistake. 
#    Or that this tweet hasn't been annotated yet. 
#    This enabled us to work with partially annotated files. 
train_df.dropna(subset=['text','Q0'], inplace=True)

In [8]:
# counting the number of entries and looking at cumulative NULLness
# Q0 is completely NULL free
# Q1,Q2,Q3 were left empty for the case where the tweets were judged to be NOT RELEVANT
train_df.info()
# note the Int64Index -- pandas supports 64 bit integers 
#    - we do not have to covert the indices to strings like R

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5092 entries, 590748 to 508736
Data columns (total 6 columns):
Q0            5092 non-null object
Q1            4652 non-null object
Q2            4652 non-null object
Q3            4652 non-null object
created_at    4094 non-null object
text          5092 non-null object
dtypes: object(6)
memory usage: 278.5+ KB


In [46]:
#  Some dates appear to be missing - warrants some investigation!!
missing_df = train_df[train_df.created_at.isna()]
# Seems like we have 998 entries (maybe from the same annotation set) with missing dates
# Can we go into the test set and repopulate the date values?

In [57]:
#turns out that there was an issue with the index. 
# the index of missingdf corresponds to twitter ids 
#   -- which we somehow dropped for the other tweets - but retained for the 998 in missing_df

(test_df
 .loc[missing_df.index,:]
).head()

Unnamed: 0,created_at,text
902112185685565441,2017-08-28 10:14:40,Angela Merkel insists she has no regrets over ...
902612931560574977,2017-08-29 19:24:27,Lol made a friend my first day at csu & now sh...
902593242390286337,2017-08-29 18:06:13,@Model3Owners and yet Merkel said combustion e...
902365247075479553,2017-08-29 03:00:14,@salmaankhwaja Partly Cloudy in #Karachi with ...
902062462710812672,2017-08-28 06:57:05,#GermanyDecids🇩🇪: Too many think pieces on Mer...


In [61]:
#putting everything together
train_df.loc[missing_df.index,'created_at'] = test_df.loc[missing_df.index,'created_at']
train_df.info()
#we fixed the mismatch

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5092 entries, 590748 to 508736
Data columns (total 6 columns):
Q0            5092 non-null object
Q1            4652 non-null object
Q2            4652 non-null object
Q3            4652 non-null object
created_at    5092 non-null object
text          5092 non-null object
dtypes: object(6)
memory usage: 438.5+ KB


In [9]:
#nothing out of the ordinary here
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
id            10000 non-null int64
created_at    10000 non-null object
text          10000 non-null object
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [62]:
#test_df = test_df.loc[test_df['lang'] == 'en']
#en_test_df = test_df[['created_at', 'text']]
#print(en_test_df.shape)
#en_test_df.head()
## this was the earlier code to filter out english tweets
##  -- but the process was eating up the RAM 
##  -- now this is done by `src/data/make_dataset.py` using the csv module

In [11]:
time_df = pd.DataFrame(test_df[['created_at']])
time_df = time_df.reset_index(drop=True)
time_df.head()
## this column of timestamps is extracted and kept aside. 
## Later the cleaned tweets shall be combined with this and a dataframe created 
## not needed anymore due to df.apply

Unnamed: 0,created_at
0,2017-06-14 18:50:57
1,2017-06-14 18:51:14
2,2017-06-14 18:51:57
3,2017-06-14 18:52:04
4,2017-06-14 18:52:57


## Data Cleaning

In [12]:
# does two things 
# 1. substitutes undesirable entities like mentions and links with blanks
#   a. remove @mentions
#   b. remove protocol://links.com
#   c. remove special characters like !@#$%^&*()
# 2. replaces multiple spaces by a single space
import re
def clean_tweet(tweet):
        '''
        Utility function to clean tweet text by removing links, special characters
        using simple regex statements.
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

In [13]:
# loops over rows of dataframe to create a list of dictionaries called parsed_tweets.
# the list of dictionaries is easy to form a new dataframe from
# however, using apply sidesteps this complex process into a one-liner
def parse_data(tweets):
    parsed_tweets = []
    cnt = 0
    for tweet in tweets:
        cnt += 1
        # empty dictionary to store required params of a tweets
        parsed_tweet = {}
        # cleaning text of tweet
        parsed_tweet['text'] = clean_tweet(tweet)
        parsed_tweets.append(parsed_tweet)
    print ("Cleaned data for %d tweet" % cnt)
    return parsed_tweets

In [14]:
# new column is created with clean tweets
clean_test_df = pd.DataFrame(parse_data(test_df['text'].tolist()))

Cleaned data for 10000 tweet


In [15]:
# join two columns into one dataframe
# axis = 1 means that columns are being added
# axis =0, which is the default, adds new rows
clean_test_df = pd.concat([time_df, clean_test_df], axis=1)

In [16]:
clean_train_df = pd.DataFrame(parse_data(train_df['text'].tolist()))
# same process of creating columns and combining them
#   into a dataframe is repeated for the training data

Cleaned data for 5092 tweet


In [17]:
selected_train_df = train_df[['Q0', 'Q1']].reset_index(drop=True)
# creates a new index and demotes the older index to yet another lowly column
# drops the entries if all other columns were NA

In [18]:
clean_train_df = pd.concat([clean_train_df, selected_train_df], axis=1)

In [19]:
clean_train_df.head()

Unnamed: 0,text,Q0,Q1
0,RT FSchaerdel Das wird eine kurze Entscheidung...,YES,UNKNOWN
1,RT AfD Bund Dr Alexander Gauland Ehefueralle E...,YES,UNKNOWN
2,RT AfD SOK Pressemitteilung des Bundestagskand...,YES,UNKNOWN
3,RT erzaehlmirnix AfD Logik LINK LINK,YES,UNKNOWN
4,JungeFreiheit Dramatischer Einstellungswandel ...,YES,UNKNOWN


In [66]:
# everything since clean_tweets was defined can be done in one line like so-- 
test_df.head().text.apply(clean_tweet)

id
875063024486293504    Spoke to the young leaders of the 18th Annual ...
875063094220861440    RT Chris Singleton of CSU drafted by Chicago C...
875063277520388096    RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
875063305748054016    RT Merkel justifies her migrant policy by sayi...
875063528335474688    RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
Name: text, dtype: object

In [67]:
# putting everything together
clean_train_df = train_df
clean_train_df['text'] = train_df.text.apply(clean_tweet)
clean_train_df.head()

Unnamed: 0,Q0,Q1,Q2,Q3,created_at,text
590748,YES,UNKNOWN,AFD,UNKNOWN,2017-07-03 08:57:48,RT FSchaerdel Das wird eine kurze Entscheidung...
450785,YES,UNKNOWN,AFD,UNKNOWN,2017-06-30 18:32:08,RT AfD Bund Dr Alexander Gauland Ehefueralle E...
576616,YES,UNKNOWN,AFD,UNKNOWN,2017-07-02 22:59:14,RT AfD SOK Pressemitteilung des Bundestagskand...
454156,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 19:11:03,RT erzaehlmirnix AfD Logik LINK LINK
377905,YES,UNKNOWN,AFD,NEGATIVE,2017-06-30 08:56:10,JungeFreiheit Dramatischer Einstellungswandel ...


In [68]:
#similarly for the test set 
clean_test_df = test_df
clean_test_df['text'] = test_df.text.apply(clean_tweet)
clean_test_df.head()

Unnamed: 0_level_0,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
875063024486293504,2017-06-14 18:50:57,Spoke to the young leaders of the 18th Annual ...
875063094220861440,2017-06-14 18:51:14,RT Chris Singleton of CSU drafted by Chicago C...
875063277520388096,2017-06-14 18:51:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
875063305748054016,2017-06-14 18:52:04,RT Merkel justifies her migrant policy by sayi...
875063528335474688,2017-06-14 18:52:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...


## Machine Learning to predict Q0 and Q1 for Test Dataset

In [20]:
column_dict = {'Q0': 'Q0_RELEVANT',
               'Q1': 'Q1_mood_of_speaker'}

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_and_predict(df_train, df_test, column="Q0",verbose=True):
    cv = TfidfVectorizer(min_df=1, stop_words='english')
    
    mask = (df_train["Q0"] == "YES")
    if column== "Q0":
        df_train_ = df_train
    else:
        mask = df_train["Q0"] == "YES"
        df_train_ = df_train[mask]
        
    tweet_train = cv.fit_transform(df_train_['text'])
    label_train = df_train_[column]
    classifier = RandomForestClassifier(50, random_state=4, verbose=verbose)
    classifier.fit(tweet_train, label_train)
    
    tweet_test = cv.transform(df_test["text"])
    return classifier.predict(tweet_test)

In [22]:
#include columns from original dataset
data = {"text":clean_test_df["text"],
        "created_at":pd.to_datetime(clean_test_df["created_at"])
       }

for column in column_dict.keys():
    print("->>{}".format(column))
    data[column] = train_and_predict(df_train = clean_train_df, df_test = clean_test_df, column = column)

->>Q0


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished


->>Q1


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.1s finished


In [23]:
df_predictions = pd.DataFrame(data)
for i, row in df_predictions.iterrows():
    if row["Q0"]=="NO" :
        df_predictions.at[i,"Q1"] = "NULL"
df_predictions

Unnamed: 0,Q0,Q1,created_at,text
0,NO,,2017-06-14 18:50:57,Spoke to the young leaders of the 18th Annual ...
1,NO,,2017-06-14 18:51:14,RT Chris Singleton of CSU drafted by Chicago C...
2,NO,,2017-06-14 18:51:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
3,YES,UNKNOWN,2017-06-14 18:52:04,RT Merkel justifies her migrant policy by sayi...
4,NO,,2017-06-14 18:52:57,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
5,YES,UNKNOWN,2017-06-14 18:53:09,RT Macron is about to win a big mandate Merkel...
6,NO,,2017-06-14 18:53:12,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
7,NO,,2017-06-14 18:53:17,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
8,NO,,2017-06-14 18:53:19,RT CORRUPT SYSTEM PRODUCES SUSTAINS BAD PEOPLE...
9,YES,UNKNOWN,2017-06-14 18:53:30,Exactly Don t forget Angela Merkel is on that ...


In [24]:
df_predictions.Q1.value_counts()

UNKNOWN     6743
NULL        3252
POSITIVE       4
NEGATIVE       1
Name: Q1, dtype: int64

In [25]:
df_predictions.Q0.value_counts()

YES    6748
NO     3252
Name: Q0, dtype: int64

In [26]:
clean_train_df.Q1.value_counts()

UNKNOWN       4580
NEGATIVE        48
POSITIVE        20
BOTH/MIXED       4
Name: Q1, dtype: int64

In [27]:
train_df.Q2.value_counts()

AFD                  4000
ANGELA MERKEL         501
DIE LINKE             114
MARTIN SCHULZ          15
SPD                    10
CDU                     6
ANGELA  MERKEL          3
ALEXANDER GAULAND       1
MERKEL                  1
CSU                     1
Name: Q2, dtype: int64

__Write the predicted dataset to csv file for further use__

In [28]:
import csv 
df_predictions.to_csv(os.path.join(rootdir,
                                   "data",
                                   "interim",
                                   "predictions01.csv"),
                      index=False,
                      quoting=csv.QUOTE_ALL,
                      quotechar='"',
                     )