In [52]:
import numpy as np
import pandas as pd
import re

# Reading in the dataset
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Clean the text data
def clean_text(texts):
    texts[:] = [text.lower() for text in texts]                                 # convert into lower letters
    texts[:] = [re.sub(r'https?:\/\/.*\/\w*', 'URL', text) for text in texts]   # http:// ... / word  or  http:// ... / word
    texts[:] = [re.sub(r'@\w+([-.]\w+)*', '', text) for text in texts]          # meaningless things like @AlexYan
    texts[:] = [re.sub(r'&\w+([-.]\w+)*', '', text) for text in texts]          # meaningless things like &amp

clean_text(train_df['text'])
clean_text(test_df['text'])

# print(train_df.head)
# no.       Before                                                  After
# 0         Our Deeds are the Reason of this #earthquake M...       our deeds are the reason of this #earthquake m...
# 7069      @aria_ahrary @TheTawniest The out of control w...       the out of control wild fires in california ...

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texts[:] = [text.lower() for text in texts]                                 # convert into lower letters
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texts[:] = [re.sub(r'https?:\/\/.*\/\w*', 'URL', text) for text in texts]   # http:// ... / word  or  http:// ... / word
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texts[:] = [re.sub(r'@\w+([-.]\w+)*', '', text) for text in texts]          # meaningless things like @AlexYan
A value is trying to be set on a copy o

In [54]:
# First attemp: using sklearn.feature_extraction.text.CountVectorizer()
# This converts a collection of text documents to a matrix of token counts,
# and produces a sparse representation of the counts using scipy.sparse.csr_matrix.

# It produces 14717 features, corresponding to the 14717 distinct words in training data.
# However, the number of features is too large to us, considering our training sample size is also large,
# which slows down the following model training.
# Instead, we decide to use a bert model. 

# from sklearn import feature_extraction
# count_vectorizer = feature_extraction.text.CountVectorizer()
# X_train = count_vectorizer.fit_transform(train_df["text"])
# X_test = count_vectorizer.transform(test_df["text"])

# print(type(train_vectors))                    # 'scipy.sparse.csr.csr_matrix'
# print(train_vectors.shape)                    # (7613, 14717): 7613 training samples, 14717 features
# train_array = train_vectors.toarray()
# print(train_array)



# Second attemp: using a pre-trained bert model
# It produces 384 features.
from sentence_transformers import SentenceTransformer
bert_model = SentenceTransformer('all-MiniLM-L12-v1')
X_train = bert_model.encode(train_df["text"])
X_test = bert_model.encode(test_df["text"])

In [58]:
print(X_train.shape)     # (7613, 384): 7613 training samples, 384 features
print(X_train)
print(X_test.shape)      # (3263, 384): 3263 training samples, 384 features
print(X_test)

(7613, 384)
[[ 0.00230695  0.07191736  0.09788872 ...  0.07197042 -0.02740236
  -0.07827645]
 [ 0.02561178  0.06068205 -0.0192754  ... -0.02490251 -0.05799778
  -0.00681885]
 [ 0.11690209  0.04465307  0.08812291 ... -0.01944826 -0.03323479
   0.02873739]
 ...
 [-0.00110018 -0.00252272  0.06226555 ...  0.04964388 -0.04241561
   0.04092398]
 [-0.04007943  0.06662016  0.0298096  ...  0.07253803 -0.0047584
   0.03727422]
 [ 0.06300642 -0.00216898  0.05352418 ...  0.01271525  0.01957589
   0.05859178]]
(3263, 384)
[[ 1.5842754e-03 -6.8736620e-02  3.9622501e-02 ...  2.9520888e-02
   3.6666408e-02  3.1300120e-02]
 [ 1.2380565e-01 -6.9417410e-02  8.9123756e-02 ...  3.4361023e-02
  -3.5430912e-05  1.1519138e-02]
 [ 1.5148669e-01  6.8411618e-03 -1.6904498e-02 ... -4.8215881e-02
  -7.4761413e-02  6.8294503e-02]
 ...
 [-6.3210293e-03  2.7175112e-02 -1.4542190e-02 ... -3.1739313e-02
  -2.7545594e-02  6.9203213e-02]
 [ 9.5806429e-03 -1.5488052e-02  5.4098818e-02 ... -6.7440167e-02
  -1.1712046e-01  

In [59]:
# Split the training set into training and validation set
from sklearn.model_selection import train_test_split
y_label = train_df['target']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_label, test_size=0.2, random_state=100000)

In [61]:
# Fit the logistic regression model on the training set
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1000, max_iter=1000)
logreg.fit(X_train, y_train)
# Find the predicted values on the validation set
y_hat_logreg = logreg.predict(X_val)
# Find the accuracy achieved on the validation set
acc_logreg = np.mean(y_hat_logreg == y_val)

In [68]:
# Find Precision, recall and fscore on the validation set
from sklearn.metrics import precision_recall_fscore_support
prec, recal, fscore, _ = precision_recall_fscore_support(y_val, y_hat_logreg, average='binary')
print('prec: ', prec)
print('recal: ', recal)
print('fscore: ', fscore)

prec:  0.8030560271646859
recal:  0.7243491577335375
fscore:  0.7616747181964573


In [65]:
# Read in the sample_submission.csv file
# Preict
submission = pd.read_csv("sample_submission.csv")
submission["target"] = logreg.predict(X_test)
submission.head()
submission.to_csv("submission.csv", index=False)