In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

## Reading data

In [3]:
train = pd.read_csv("train_F3WbcTw.csv")
test = pd.read_csv("test_tOlRoBf.csv")
sample = pd.read_csv("sample_submission_i5xnIZD.csv")

## Data Cleaning

In [4]:
train.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5279 entries, 0 to 5278
Data columns (total 4 columns):
unique_hash    5279 non-null object
text           5279 non-null object
drug           5279 non-null object
sentiment      5279 non-null int64
dtypes: int64(1), object(3)
memory usage: 165.0+ KB


In [6]:
train.drug.value_counts()

ocrevus                         676
gilenya                         666
ocrelizumab                     441
entyvio                         303
humira                          270
fingolimod                      238
remicade                        229
opdivo                          224
tarceva                         218
cladribine                      200
keytruda                        199
stelara                         161
tagrisso                        161
alimta                          146
lucentis                         67
eylea                            65
avastin                          52
nivolumab                        50
cimzia                           48
alectinib                        43
crizotinib                       43
vitrectomy                       41
simponi                          39
erlotinib                        35
tecentriq                        34
xalkori                          33
pemetrexed                       33
tysabri                     

In [7]:
train.sentiment.value_counts()

2    3825
1     837
0     617
Name: sentiment, dtype: int64

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924 entries, 0 to 2923
Data columns (total 3 columns):
unique_hash    2924 non-null object
text           2924 non-null object
drug           2924 non-null object
dtypes: object(3)
memory usage: 68.6+ KB


In [9]:
sample.head()

Unnamed: 0,unique_hash,sentiment
0,9e9a8166b84114aca147bf409f6f956635034c08,0
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,0
2,50b6d851bcff4f35afe354937949e9948975adf7,0
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,0
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,0


## Feature Engg and EDA

#### Quick feature engg and model gen

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

In [17]:
train_df = train.copy()
test_df = test.copy()

In [15]:
v = TfidfVectorizer()

In [20]:
a = train_df.text
b = test_df.text

In [22]:
d = a.append(b)

In [23]:
v.fit(d)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [24]:
train_w_f = v.transform(train_df.text)

In [25]:
le  = LabelEncoder()

In [26]:
bn = train_df.drug.append(test_df.drug)

In [27]:
le.fit(bn)

LabelEncoder()

In [28]:
train_n = le.transform(train_df.drug)

In [36]:
train_n = np.array(train_n)

In [40]:
train_n = train_n.reshape(5279,1)

In [47]:
newfeatures = hstack((train_w_f, train_n)).tocsr()

In [52]:
x = newfeatures

In [53]:
y = train_df.sentiment

In [55]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=.20, random_state=0)
for train_index, test_index in sss.split(x, y):
    x_train,x_test,y_train,y_test = x[train_index],x[test_index],y[train_index],y[test_index]

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
rf = RandomForestClassifier(n_estimators=200)

In [59]:
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [60]:
ypred = rf.predict(x_test)

In [62]:
f1_score(y_test,ypred,average='macro')

  'precision', 'predicted', average, warn_for)


0.2946832790810791

In [63]:
rf.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [64]:
train_w_f = v.transform(test_df.text)

In [65]:
train_n = le.transform(test_df.drug)

In [67]:
train_n = train_n.reshape(2924,1)

In [68]:
newfeatures = hstack((train_w_f, train_n)).tocsr()

In [69]:
v = rf.predict(newfeatures)

In [75]:
k = pd.DataFrame({"unique_hash":test_df.unique_hash,"sentiment":v})

In [77]:
k.to_csv("sub_baseline_1_rf_tfidf.csv",index=False)