In [1]:
import os
import random
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Fetch the NLTK stop-word corpus; the import below needs it to be available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop-word list, referenced later by the grid-search parameter grid.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Florian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Root folder that holds the `train_data` and `output` sub-folders.
# Set the COVIDSENT_DATA_DIR environment variable to run the notebook on
# another machine; without it the original author's location is used.
data_dir = os.environ.get(
    "COVIDSENT_DATA_DIR",
    "C:\\Users\\Florian\\Documents\\GitHub\\CovidSent\\Data",
)
# NOTE: despite its name, `test_path` is the folder of the *training* CSV.
test_path = os.path.join(data_dir, "train_data")
output_path = os.path.join(data_dir, "output")
file = "training.1600000.processed.noemoticon.csv"

final_file = "output.csv"
output_file = "output_sents.csv"

# Full paths used by the cells below:
#   load   -> labelled training data that is read into `dataframe`
#   to_clf -> CSV with the texts to classify
#   out    -> destination of the classified CSV
load = os.path.join(test_path, file)
to_clf = os.path.join(output_path, final_file)
out = os.path.join(output_path, output_file)

In [3]:
# p < 1 only for testing purposes!
# p = fraction of the dataset to load (1 keeps every row, 0.1 roughly 10 %).
p = 1
heads = ['sent', 'id', 'date', 'query', 'user', 'text']
# Dedicated, seeded generator: a sub-sampled load (p < 1) selects the same rows
# on every run and does not depend on the global `random` state.
row_sampler = random.Random(0)
# The file has no header row, so the column names are supplied via `names`.
# Row 0 is always kept; every other row is skipped with probability 1 - p.
dataframe = pd.read_csv(load, encoding="ISO-8859-1", engine='python', names=heads,
                        skiprows=lambda i: i > 0 and row_sampler.random() > p)

In [4]:
# Sanity check: number of rows that were loaded.
print(dataframe.shape[0])

1600000


In [5]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [6]:
# Bag-of-words vectorizer, created with the library's default settings.
count = CountVectorizer()

In [7]:
# Learn the vocabulary from the `text` column and build the document-term
# count matrix in a single pass.
bag_of_words = count.fit_transform(dataframe["text"].values)

In [8]:
# bag_of_words.toarray()

In [9]:
# TF-IDF re-weighting with smoothed IDF and L2-normalised rows.
# NOTE: the name `tfidf` is rebound to a TfidfVectorizer in a later cell.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

In [10]:
# tfidf.fit_transform(bag_of_words).toarray()

In [11]:
# One shared stemmer instance, reused by every call of `tokenizer_porter`.
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [12]:
# Features are the raw `text` column, labels the `sent` column.
X, y = dataframe['text'], dataframe['sent']
# 70 / 30 split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [13]:
# Raw-text -> TF-IDF vectorizer. Lower-casing is switched off; no accent
# stripping and no custom preprocessor are configured here.
# NOTE: this rebinds `tfidf`, which an earlier cell bound to a TfidfTransformer.
tfidf = TfidfVectorizer(
    lowercase=False,
    preprocessor=None,
    strip_accents=None,
)

In [14]:
def preprocessor(text):
    """Return a cleaned, lower-cased version of ``text``.

    HTML-like tags are removed, every run of non-word characters is
    collapsed to a single space, and the emoticons found in the text are
    appended at the end (space separated) with the "nose" dash removed.
    Emoticons are collected before lower-casing, so their letters keep
    their case. A single separating space is always appended after the
    cleaned text, even when no emoticon was found.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        Cleaned text followed by a space and the joined emoticons.
    """
    # Patterns are raw strings so the regex escapes reach `re` unchanged and
    # Python does not warn about invalid string escape sequences.
    # Remove HTML markup
    text = re.sub(r'<[^>]*>', '', text)
    # Save emoticons for later appending: eyes (: ; =), optional nose, mouth
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Replace any run of non-word characters by one space and convert to
    # lower case
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Append the emoticons, dropping the nose character for standardization
    text = cleaned + ' ' + ' '.join(emoticons).replace('-', '')

    return text

In [15]:


# Vectorizer settings that both grid variants have in common.
shared_vect_params = {'vect__ngram_range': [(1, 1)],
                      'vect__stop_words': [stop],
                      'vect__tokenizer': [tokenizer_porter],
                      'vect__preprocessor': [preprocessor]}
# Classifier settings searched in both grid variants.
shared_clf_params = {'clf__penalty': ['l1', 'l2'],
                     'clf__C': [1.0, 10.0]}

# Two sub-grids: the first leaves the vectorizer's IDF / norm settings
# untouched, the second switches IDF weighting and normalisation off.
param_grid = [
    {**shared_vect_params, **shared_clf_params},
    {**shared_vect_params,
     'vect__use_idf': [False],
     'vect__norm': [None],
     **shared_clf_params},
]


In [16]:

# Text-classification pipeline: TF-IDF features ('vect') followed by a
# logistic-regression classifier ('clf').
# The solver is pinned to liblinear because the parameter grid searches both
# the 'l1' and the 'l2' penalty; the default solver of newer scikit-learn
# releases (lbfgs) does not support 'l1'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

In [17]:
# Exhaustive search over `param_grid` for the pipeline, scored by accuracy
# with 2-fold cross-validation, using all available cores.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

In [18]:
# Learn vocabulary and IDF weights from the complete text column `X`
# (this includes the rows that were held out as `X_test`).
tfidf.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
# Vectorize both splits with the already fitted vectorizer.
X_train_tfif, X_test_tfif = (
    tfidf.transform(split) for split in (X_train, X_test)
)

In [20]:
# full batch training
# NOTE: `X_train_tfif` is rebound here to the TF-IDF matrix of the *entire*
# text column `X`, replacing the train-split matrix built in the previous cell.
X_train_tfif = tfidf.transform(X)
# The solver is pinned to liblinear (the one the recorded run used, see the
# "[LibLinear]" output) so that `max_iter=75` keeps the same meaning on
# scikit-learn releases whose default solver is different.
logreg = LogisticRegression(random_state=0, verbose=1, C=1.25, penalty="l2",
                            max_iter=75, solver='liblinear')
logreg.fit(X_train_tfif, y)

[LibLinear]

LogisticRegression(C=1.25, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=75, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)

In [21]:
# Load the data to classify; its `text` column is vectorized in the next cell.
target = pd.read_csv(to_clf)

In [22]:
# Vectorize the texts to classify with the vectorizer fitted on the training data.
target_x = tfidf.transform(target["text"].values)

In [34]:
# Per-class probabilities (one row per text) ...
target_prob = logreg.predict_proba(target_x)
# ... and the hard class label for each text.
target_y = logreg.predict(target_x)

In [35]:
# Attach the predicted class label to every row of `target`.
target["prediction"] = target_y


In [36]:
# Probability of the positive class (label 4) for every classified text.
# `predict_proba` orders its columns like `logreg.classes_`, so the column is
# looked up from the fitted model instead of being hard-coded as index 1.
positive_col = list(logreg.classes_).index(4)
prob_to_be_pos = [row[positive_col] for row in target_prob]

In [37]:
# Show the class labels known to the fitted model, in the model's own order.
logreg.classes_

array([0, 4], dtype=int64)

In [38]:
# Store the positive-class probability next to the predicted label.
target["pos_proba"] = prob_to_be_pos

In [39]:
# Display only: the rounded Series is not assigned back, so the column in
# `target` keeps its full precision.
target["pos_proba"].round(decimals=4)

0        0.4441
1        0.3579
2        0.7889
3        0.8485
4        0.5445
5        0.6673
6        0.3675
7        0.7280
8        0.7727
9        0.9364
10       0.7856
11       0.6232
12       0.4096
13       0.9582
14       0.7207
15       0.9170
16       0.4268
17       0.8537
18       0.6070
19       0.6653
20       0.7324
21       0.8547
22       0.5516
23       0.7199
24       0.6431
25       0.8749
26       0.2505
27       0.4661
28       0.4821
29       0.5269
          ...  
18259    0.4621
18260    0.7619
18261    0.7851
18262    0.6262
18263    0.4170
18264    0.6641
18265    0.7554
18266    0.8337
18267    0.8337
18268    0.6811
18269    0.5448
18270    0.2716
18271    0.8871
18272    0.8240
18273    0.4178
18274    0.5537
18275    0.1377
18276    0.6325
18277    0.8875
18278    0.5542
18279    0.8172
18280    0.6673
18281    0.7701
18282    0.7370
18283    0.3310
18284    0.8686
18285    0.0408
18286    0.8899
18287    0.5874
18288    0.5055
Name: pos_proba, Length:

In [42]:
# Write `target`, now including the `prediction` and `pos_proba` columns, to
# the output CSV. No `index` argument is passed, so pandas' default applies.
target.to_csv(out)

In [43]:
# Quick look at the predicted labels as a plain array.
target["prediction"].values

array([0, 0, 4, ..., 4, 4, 4], dtype=int64)