In [4]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
# from xgboost import XGBClassifier


from Help_Funs import count_chars, count_words, count_capital_chars, count_capital_words, count_sent, count_unique_words, count_stopwords, count_hashtags 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Covid-tweets/training_data.csv'
file_key_2 = 'Covid-tweets/test_data.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)

## Feature Engineering 
train['is_quote'] = train['is_quote'].str.lower()
train['is_quote'] = np.where(train['is_quote'] == 'false', 0, 1)
train['is_retweet'] = train['is_retweet'].str.lower()
train['is_retweet'] = np.where(train['is_retweet'] == 'false', 0, 1)
train['Trump_flag'] = np.where(train['reply_to_screen_name'] == 'realDonaldTrump', 1, 0)
train['jfrketich_flag'] = np.where(train['reply_to_screen_name'] == 'jfrketich', 1, 0)


test['is_quote'] = np.where(test['is_quote'] == False, 0, 1)
test['is_retweet'] = np.where(test['is_retweet'] == False, 0, 1)
test['Trump_flag'] = np.where(test['reply_to_screen_name'] == 'realDonaldTrump', 1, 0)
test['jfrketich_flag'] = np.where(test['reply_to_screen_name'] == 'jfrketich', 1, 0)

# Basic Feature Engineering

In [7]:
train['char_count'] = train['text'].apply(lambda x: count_chars(x))
train['word_count'] = train['text'].apply(lambda x: count_words(x))
train['sent_count'] = train['text'].apply(lambda x: count_sent(x))
train['capital_char_count'] = train['text'].apply(lambda x: count_capital_chars(x))
train['capital_word_count'] = train['text'].apply(lambda x: count_capital_words(x))
# train['quoted_word_count'] = train['text'].apply(lambda x: count_words_in_quotes(x))
train['stopword_count'] = train['text'].apply(lambda x: count_stopwords(x))
train['unique_word_count'] = train['text'].apply(lambda x: count_unique_words(x))
                                                 
test['char_count'] = test['text'].apply(lambda x: count_chars(x))
test['word_count'] = test['text'].apply(lambda x: count_words(x))
test['sent_count'] = test['text'].apply(lambda x: count_sent(x))
test['capital_char_count'] = test['text'].apply(lambda x: count_capital_chars(x))
test['capital_word_count'] = test['text'].apply(lambda x: count_capital_words(x))
# test['quoted_word_count'] = test['text'].apply(lambda x: count_words_in_quotes(x))
test['stopword_count'] = test['text'].apply(lambda x: count_stopwords(x))
test['unique_word_count'] = test['text'].apply(lambda x: count_unique_words(x))
                                                 
## Average word length
train['avg_wordlength'] = train['char_count'] / train['word_count']
test['avg_wordlength'] = test['char_count'] / test['word_count']

## Average sentence lenght
train['avg_sentlength'] = train['word_count'] / train['sent_count']
test['avg_sentlength'] = test['word_count'] / test['sent_count']

## Unique words vs count words
train['unique_vs_words'] = train['unique_word_count'] / train['word_count']
test['unique_vs_words'] = test['unique_word_count'] / test['word_count']

## stopwords vs count words
train['stopwords_vs_words'] = train['stopword_count'] / train['word_count']
test['stopwords_vs_words'] = test['stopword_count'] / test['word_count']

In [12]:
## Defining input and target
X = train.drop(columns = ['text', 'reply_to_screen_name', 'hashtags', 'country'], axis = 1)
Y = train['country']
Y = np.where(Y == 'us', 0, 
             np.where(Y == 'uk', 1, 
                      np.where(Y == 'canada', 2, 
                               np.where(Y == 'australia', 3,
                                        np.where(Y == 'ireland', 4, 5)))))

In [14]:
Y

array([0, 0, 0, ..., 5, 5, 5])

In [11]:
Y.value_counts()

us             40000
uk             40000
canada         40000
australia      40000
ireland        40000
new_zealand    40000
Name: country, dtype: int64

In [8]:
train.head()

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country,Trump_flag,jfrketich_flag,char_count,word_count,sent_count,capital_char_count,capital_word_count,stopword_count,unique_word_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words
0,Remember the #WuhanCoronaVirus? The pandemic w...,,1,1,WuhanCoronaVirus KillerCuomo,us,0,0,267,41,5,12,0,17,36,6.512195,8.2,0.878049,0.414634
1,My sources @WhiteHouse say 2 tactics will be u...,,1,1,Trump,us,0,0,281,48,6,22,2,11,42,5.854167,8.0,0.875,0.229167
2,I'll venture a wild guess: If you were running...,,1,1,COVID19,us,0,0,292,50,3,20,3,16,44,5.84,16.666667,0.88,0.32
3,#Pakistan (#GreenStimulus = #Nature protection...,,1,1,Pakistan GreenStimulus Nature Green,us,0,0,236,31,1,23,1,8,30,7.612903,31.0,0.967742,0.258065
4,🇺🇸 Pandémie de #coronavirus: 30 pasteurs améri...,,1,1,coronavirus COVID__19 COVIDー19,us,0,0,279,35,3,18,2,2,35,7.971429,11.666667,1.0,0.057143


In [3]:
train['hashtags'].value_counts()

COVID19                        58516
coronavirus                    13002
Covid19                         8020
COVID                           4649
covid19                         4578
                               ...  
covid19 Lockdown                   1
NCAAFootball coronavirus           1
Liverpool Covid19 JustsayNO        1
coronavirus DYK                    1
Covid_19 HopeAlive                 1
Name: hashtags, Length: 80149, dtype: int64

In [47]:
train['reply_to_screen_name'].value_counts()

realDonaldTrump    497
jfrketich          129
NYGovCuomo          72
BorisJohnson        70
InfoInterest        47
                  ... 
SNC_GC               1
starwars             1
UniofOxford          1
OPHA_Ontario         1
SoonerReporter       1
Name: reply_to_screen_name, Length: 7943, dtype: int64

In [48]:
test['reply_to_screen_name'].value_counts()

realDonaldTrump    112
jfrketich           26
paddypower          19
BdaGovernment       16
LukePField          15
                  ... 
FarleyMedia          1
pmagn                1
CllrIanSherwood      1
OprosUK              1
SqueezeJuice         1
Name: reply_to_screen_name, Length: 2418, dtype: int64

In [53]:
sum(test['reply_to_screen_name'].unique() == 'InfoInterest')

1

In [45]:
train['is_quote'].value_counts()

1    150205
0     89795
Name: is_quote, dtype: int64

In [46]:
train['is_retweet'].value_counts()

1    205896
0     34104
Name: is_retweet, dtype: int64

In [7]:
train['country'].value_counts()

us             40000
uk             40000
canada         40000
australia      40000
ireland        40000
new_zealand    40000
Name: country, dtype: int64

In [8]:
train['is_quote'].value_counts()

False               109482
FALSE                89795
True                 21590
TRUE                 19132
Colin o'donoghue         1
Name: is_quote, dtype: int64

In [13]:
sum(train['is_quote'].isna())

0

In [15]:
train[train['is_quote'] == "Colin o'donoghue"]

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country
162661,0,NYCTogether,Colin o'donoghue,ireland,1498,ireland


In [11]:
train.shape

(240000, 6)

In [6]:
test.head()

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,Id
0,"Ethical investing is not optional anymore, say...",,False,False,covid19,0
1,#COVID19 | Suite à la conférence de presse du ...,,False,True,COVID19,1
2,"Yesterday, I had a live discussion with @Steve...",,False,True,COVID19,2
3,Nepal - #Coronavirus cases up 24% in a week. D...,,False,True,Coronavirus,3
4,American economy jumped up a % big news story...,,False,False,LysolAndCloroxSales,4


# Feature Engineering 

In [37]:
train['is_quote'] = train['is_quote'].str.lower()
train['is_quote'] = np.where(train['is_quote'] == 'false', 0, 1)

test['is_quote'] = np.where(test['is_quote'] == False, 0, 1)

In [40]:
train['is_retweet'].value_counts()

True       87067
TRUE       74823
False      44005
FALSE      34104
ireland        1
Name: is_retweet, dtype: int64

In [41]:
train[train['is_retweet'] == 'ireland']

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country
162661,0,NYCTogether,1,ireland,1498,ireland


In [42]:
test['is_retweet'].value_counts()

True     40284
False    19716
Name: is_retweet, dtype: int64