In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Training on the combined keyword + tweet text

In [2]:
# --- Training set: a single "combined" text column (keyword + tweet) plus the label ---
train = pd.read_csv('/Users/yunjuha/Downloads/nlp-getting-started/train.csv')
selected_columns = ['keyword', 'text', 'target']
train_df = (
    train
    .dropna(subset=['keyword'])   # rows without a keyword are discarded
    .loc[:, selected_columns]
    .assign(combined=lambda frame: frame['keyword'] + " " + frame['text'])
    .drop(['keyword', 'text'], axis=1)   # remaining columns: target, combined
)

# --- Test set: same transformation, plus an empty-string 'target' placeholder ---
test = pd.read_csv('/Users/yunjuha/Downloads/nlp-getting-started/test.csv')
selected_columns = ['keyword', 'text']
test_df = (
    test
    .dropna(subset=['keyword'])
    .loc[:, selected_columns]
    .assign(combined=lambda frame: frame['keyword'] + " " + frame['text'])
    .drop(['keyword', 'text'], axis=1)
    .assign(target="")   # remaining columns: combined, target
)

In [3]:
# Fit a TF-IDF + logistic-regression classifier on the combined text and
# report its metrics on a held-out validation split.

y = train_df['target']
X_combined = train_df['combined']

# 80/20 split of the labelled rows; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

# The vectorizer is fitted on the training split only; the held-out split
# is transformed with the already-fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

clf = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

# Validation metrics (binary target).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Confusion Matrix:", conf_matrix, sep="\n")

Accuracy: 0.8047650562541363
Precision: 0.7888513513513513
Recall: 0.7331240188383046
F1 Score: 0.7599674532139951
Confusion Matrix:
[[749 125]
 [170 467]]


In [4]:
# Predict labels for the keyword-bearing rows of the test set with the
# combined-text model (tfidf_vectorizer and clf fitted in the training cell).
#
# Dedicated names are used for the test-set features so that the validation
# split (X_test, y_test, X_test_tfidf, y_pred) is not overwritten here.
# No y_test is built from test_df: its 'target' column holds only the
# empty-string placeholder, so there are no true labels to evaluate against.
X_submission_tfidf = tfidf_vectorizer.transform(test_df['combined'])

# Replace the placeholder 'target' column with the predicted labels.
test_df['target'] = clf.predict(X_submission_tfidf)

# Keep an independent copy rather than an alias, so later changes to test_df
# cannot alter the predictions recorded for this model.
combined_test_df = test_df.copy()

combined_test_df

Unnamed: 0,combined,target
15,ablaze Birmingham Wholesale Market is ablaze B...,0
16,ablaze @sunkxssedharry will you wear shorts fo...,0
17,ablaze #PreviouslyOnDoyinTv: Toke MakinwaÛªs ...,1
18,ablaze Check these out: http://t.co/rOI2NSmEJJ...,0
19,ablaze PSA: IÛªm splitting my personalities.\...,0
...,...,...
3247,wrecked RT CNBC '3 words from Disney CEO Bob I...,0
3248,wrecked Smackdown tyme this should put me in a...,0
3249,wrecked @thrillhho jsyk I haven't stopped thin...,0
3250,wrecked @stighefootball Begovic has been garba...,0


# Separating keyword and tweet

In [5]:
# Reload both CSVs; this time every original column is kept so that keyword
# and tweet text can be used as separate features.

# Training rows that have a keyword.
train = pd.read_csv('/Users/yunjuha/Downloads/nlp-getting-started/train.csv')
train_df = train.dropna(subset=['keyword'])

# Test rows that have a keyword, with an empty-string 'target' placeholder.
test = pd.read_csv('/Users/yunjuha/Downloads/nlp-getting-started/test.csv')
test_df = test.dropna(subset=['keyword']).assign(target="")

In [6]:
import scipy.sparse

# Train and validate a logistic-regression model in which keyword and tweet
# text are vectorized independently and their TF-IDF matrices are stacked
# side by side.

X = train_df[['keyword', 'text']]
y = train_df['target']

# 80/20 split of the labelled rows; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One vectorizer per field, each fitted on the training split only.
tfidf_vectorizer_keyword = TfidfVectorizer()
tfidf_vectorizer_tweet = TfidfVectorizer()

X_train_keyword_tfidf = tfidf_vectorizer_keyword.fit_transform(X_train['keyword'])
X_train_tweet_tfidf = tfidf_vectorizer_tweet.fit_transform(X_train['text'])
X_train_combined_tfidf = scipy.sparse.hstack(
    [X_train_keyword_tfidf, X_train_tweet_tfidf]
)

clf = LogisticRegression().fit(X_train_combined_tfidf, y_train)

# The held-out split is transformed with the fitted vectorizers and stacked
# in the same column order (keyword block first, tweet block second).
X_test_keyword_tfidf = tfidf_vectorizer_keyword.transform(X_test['keyword'])
X_test_tweet_tfidf = tfidf_vectorizer_tweet.transform(X_test['text'])
X_test_combined_tfidf = scipy.sparse.hstack(
    [X_test_keyword_tfidf, X_test_tweet_tfidf]
)

y_pred = clf.predict(X_test_combined_tfidf)

# Validation metrics (binary target).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Confusion Matrix:", conf_matrix, sep="\n")

Accuracy: 0.7921906022501655
Precision: 0.7609046849757674
Recall: 0.7394034536891679
F1 Score: 0.75
Confusion Matrix:
[[726 148]
 [166 471]]


In [7]:
import scipy.sparse

# Predict labels for the keyword-bearing rows of the test set with the
# separate-vectorizer model: transform each field with its own fitted
# vectorizer, then stack keyword columns first and tweet columns second,
# matching the layout used when the classifier was fitted.
X_test_keyword_tfidf = tfidf_vectorizer_keyword.transform(test_df['keyword'])
X_test_tweet_tfidf = tfidf_vectorizer_tweet.transform(test_df['text'])
X_test_combined_tfidf = scipy.sparse.hstack(
    [X_test_keyword_tfidf, X_test_tweet_tfidf]
)

y_pred = clf.predict(X_test_combined_tfidf)

# sep_test_df is another name for test_df (no copy is made), so writing the
# predictions through it also replaces the placeholder column in test_df.
sep_test_df = test_df
sep_test_df['target'] = y_pred
sep_test_df

Unnamed: 0,id,keyword,location,text,target
15,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...,0
16,47,ablaze,Niall's place | SAF 12 SQUAD |,@sunkxssedharry will you wear shorts for race ...,0
17,51,ablaze,NIGERIA,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriag...,1
18,58,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0
19,60,ablaze,"Los Angeles, Califnordia",PSA: IÛªm splitting my personalities.\n\n?? t...,0
...,...,...,...,...,...
3247,10806,wrecked,Seattle Washington,RT CNBC '3 words from Disney CEO Bob Iger wrec...,0
3248,10807,wrecked,Acey mountain islanddåÇTorontoåÈ,Smackdown tyme this should put me in a good mo...,0
3249,10816,wrecked,los angeles,@thrillhho jsyk I haven't stopped thinking abt...,0
3250,10820,wrecked,"Brussels, Belgium",@stighefootball Begovic has been garbage. He g...,0


# Differences between combining keyword and text vs. separating them

In [8]:
# Count the test rows on which the two models' predicted labels disagree.
disagreement_mask = combined_test_df['target'] != sep_test_df['target']
differences = disagreement_mask.sum()

print("Number of differences:", differences)

Number of differences: 152
