In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV, train_test_split
# from xgboost import XGBClassifier
from sklearn.metrics import roc_curve
from sklearn.impute import KNNImputer

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Covid-tweets/training_data.csv'
file_key_2 = 'Covid-tweets/test_data.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)

## Feature Engineering 
train['is_quote'] = train['is_quote'].str.lower()
train['is_quote'] = np.where(train['is_quote'] == 'false', 0, 1)
train['is_retweet'] = train['is_retweet'].str.lower()
train['is_retweet'] = np.where(train['is_retweet'] == 'false', 0, 1)
train['Trump_flag'] = np.where(train['reply_to_screen_name'] == 'realDonaldTrump', 1, 0)
train['jfrketich_flag'] = np.where(train['reply_to_screen_name'] == 'jfrketich', 1, 0)


test['is_quote'] = np.where(test['is_quote'] == False, 0, 1)
test['is_retweet'] = np.where(test['is_retweet'] == False, 0, 1)
test['Trump_flag'] = np.where(test['reply_to_screen_name'] == 'realDonaldTrump', 1, 0)
test['jfrketich_flag'] = np.where(test['reply_to_screen_name'] == 'jfrketich', 1, 0)

In [44]:
train.head()

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country
0,Remember the #WuhanCoronaVirus? The pandemic w...,,1,1,WuhanCoronaVirus KillerCuomo,us
1,My sources @WhiteHouse say 2 tactics will be u...,,1,1,Trump,us
2,I'll venture a wild guess: If you were running...,,1,1,COVID19,us
3,#Pakistan (#GreenStimulus = #Nature protection...,,1,1,Pakistan GreenStimulus Nature Green,us
4,🇺🇸 Pandémie de #coronavirus: 30 pasteurs améri...,,1,1,coronavirus COVID__19 COVIDー19,us


In [47]:
train['reply_to_screen_name'].value_counts()

realDonaldTrump    497
jfrketich          129
NYGovCuomo          72
BorisJohnson        70
InfoInterest        47
                  ... 
SNC_GC               1
starwars             1
UniofOxford          1
OPHA_Ontario         1
SoonerReporter       1
Name: reply_to_screen_name, Length: 7943, dtype: int64

In [48]:
test['reply_to_screen_name'].value_counts()

realDonaldTrump    112
jfrketich           26
paddypower          19
BdaGovernment       16
LukePField          15
                  ... 
FarleyMedia          1
pmagn                1
CllrIanSherwood      1
OprosUK              1
SqueezeJuice         1
Name: reply_to_screen_name, Length: 2418, dtype: int64

In [53]:
sum(test['reply_to_screen_name'].unique() == 'InfoInterest')

1

In [45]:
train['is_quote'].value_counts()

1    150205
0     89795
Name: is_quote, dtype: int64

In [46]:
train['is_retweet'].value_counts()

1    205896
0     34104
Name: is_retweet, dtype: int64

In [7]:
train['country'].value_counts()

us             40000
uk             40000
canada         40000
australia      40000
ireland        40000
new_zealand    40000
Name: country, dtype: int64

In [8]:
train['is_quote'].value_counts()

False               109482
FALSE                89795
True                 21590
TRUE                 19132
Colin o'donoghue         1
Name: is_quote, dtype: int64

In [13]:
sum(train['is_quote'].isna())

0

In [15]:
train[train['is_quote'] == "Colin o'donoghue"]

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country
162661,0,NYCTogether,Colin o'donoghue,ireland,1498,ireland


In [11]:
train.shape

(240000, 6)

In [6]:
test.head()

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,Id
0,"Ethical investing is not optional anymore, say...",,False,False,covid19,0
1,#COVID19 | Suite à la conférence de presse du ...,,False,True,COVID19,1
2,"Yesterday, I had a live discussion with @Steve...",,False,True,COVID19,2
3,Nepal - #Coronavirus cases up 24% in a week. D...,,False,True,Coronavirus,3
4,American economy jumped up a % big news story...,,False,False,LysolAndCloroxSales,4


# Feature Engineering 

In [37]:
train['is_quote'] = train['is_quote'].str.lower()
train['is_quote'] = np.where(train['is_quote'] == 'false', 0, 1)

test['is_quote'] = np.where(test['is_quote'] == False, 0, 1)

In [40]:
train['is_retweet'].value_counts()

True       87067
TRUE       74823
False      44005
FALSE      34104
ireland        1
Name: is_retweet, dtype: int64

In [41]:
train[train['is_retweet'] == 'ireland']

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country
162661,0,NYCTogether,1,ireland,1498,ireland


In [42]:
test['is_retweet'].value_counts()

True     40284
False    19716
Name: is_retweet, dtype: int64