In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

In [25]:
raw_train_df = pd.read_csv("./data/train.csv")
raw_test_df = pd.read_csv("./data/test.csv")

In [26]:
def text_cleaner(text):
    text = text.lower() # convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text) # remove extra whitespace
    text = re.sub(r'https?://\S+', '', text) # remove URLs
    text = re.sub(r"#", "", text)
    return text

In [27]:
del raw_train_df['keyword']
del raw_train_df['location']

In [30]:
# Replaces the null values in the data with an empty string
train_data = raw_train_df.where((pd.notnull(raw_train_df)),'')
test_data = raw_test_df.where((pd.notnull(raw_test_df)),'')

train_data['text'] = train_data['text'].apply(text_cleaner)
test_data['text'] = test_data['text'].apply(text_cleaner)

In [36]:
X, Y = train_data['text'], train_data['target']

In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state= 3)

print(X_train.shape, X_test.shape)

(6851,) (762,)


In [43]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

# Below we fit the vectorizer to the training data 
X_train_features = feature_extraction.fit_transform(X_train)
# not need to fit this time only transform based on the previous fit
X_test_features = feature_extraction.transform(X_test)

# convert all values of Y_train and Y_test to integers 
Y_train = Y_train.astype("int")
Y_test = Y_test.astype("int")

In [60]:
model = LogisticRegression()
# training the logistic regression model with the training data 
model.fit(X_train_features, Y_train)

LogisticRegression()

In [53]:
prediction_on_train_data = model.predict(X_train_features)
accuracy_on_train_data = accuracy_score(Y_train, prediction_on_train_data)

In [54]:
print(accuracy_on_train_data)

0.8933002481389578


In [47]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [48]:
print(accuracy_on_test_data)

0.8083989501312336


In [57]:
input_mail = ["For Legal and Medical Referral Service @1800_Injured Call us at: 1-800-465-87332 #accident #slipandfall #dogbite"]
input_feature_mail = feature_extraction.transform(input_mail)
print(model.predict(input_feature_mail))

[1]
