In [28]:
# Read in previously generated data
def retrieve_subframes(n_parts=8, data_dir='../data_inspection/generated_and_cleaned', prefix='it_'):
    '''
    Gets partitioned dataframes from source.

    Reads the CSV files <data_dir>/<prefix>0.csv, <data_dir>/<prefix>1.csv, ...
    up to partition n_parts - 1, in that order. With no arguments this reads the
    same eight files the notebook has always used.

    :param n_parts: number of consecutive partition files to read, starting at 0
    :param data_dir: directory holding the partition files
    :param prefix: file-name prefix placed before the partition number
    :return: list of dataframes, one per partition, in partition order
    '''
    # The path is assembled with '/' so the default location is spelled exactly as before
    return [pd.read_csv(f'{data_dir}/{prefix}{part}.csv') for part in range(n_parts)]

In [29]:
def drop_unnamed(subframes):
    '''
    Removes the 'Unnamed: 0' column from every dataframe that has one.

    Dataframes are modified in place; those without the column are not touched.
    :param subframes: list of dataframes
    :return: the same list object that was passed in
    '''
    stray_column = 'Unnamed: 0'
    for frame in subframes:
        if stray_column not in frame.columns:
            continue
        frame.drop(columns=[stray_column], inplace=True)
    return subframes

In [30]:
def create_all_sentence_set(merged_df):
    '''
    Flattens question/answer pairs into a single labelled sentence table.

    All questions come first, followed by all answers, on a fresh 0..n-1 index.
    :param merged_df: dataframe with 'question' and 'answers' columns
    :return: dataframe with columns 'text' and 'is_question'
             (1 marks a question, 0 marks an answer)
    '''
    questions = merged_df['question']
    answers = merged_df['answers']

    sentences = pd.DataFrame({'text': pd.concat([questions, answers], ignore_index=True)})
    # Labels follow the concatenation order: questions first, then answers
    sentences['is_question'] = [1] * len(questions) + [0] * len(answers)
    return sentences

In [31]:
from nltk import word_tokenize
## Stemmer
# Tokenization and stemming
from nltk.stem import PorterStemmer

# Single stemmer instance shared by every tokenize_and_stem call
stemmer = PorterStemmer()


def tokenize_and_stem(text):
    '''
    Splits text into tokens with word_tokenize and stems each token.
    :param text: string to process
    :return: list of stemmed tokens, in token order
    '''
    return [stemmer.stem(word) for word in word_tokenize(text)]

In [32]:
from sklearn.model_selection import train_test_split


def get_train_test(df, test_size=0.2, random_state=7):
    '''
    Splits the labelled sentences into train and test sets.

    Side effect: df's index is reset in place to 0..n-1 (the old index is
    discarded), so the index labels carried by the returned splits are row
    positions in df as it was passed in.

    :param df: dataframe with 'text' and 'is_question' columns
    :param test_size: forwarded to train_test_split (default 0.2)
    :param random_state: seed forwarded to train_test_split (default 7)
    :return: X_train, X_test, y_train, y_test as produced by train_test_split
    '''
    # Kept in place on purpose: callers can look split rows up in df by label afterwards
    df.reset_index(drop=True, inplace=True)
    return train_test_split(
        df['text'],
        df['is_question'],
        test_size=test_size,
        random_state=random_state,
    )

In [33]:
def halve_df(a_df):
    '''
    Splits a dataframe into two consecutive pieces of equal size.

    Each piece has len(a_df) // 2 rows, so with an odd number of rows the
    final row ends up in neither piece.
    :param a_df: dataframe to split
    :return: [first_half, second_half]
    '''
    half = len(a_df) // 2
    first_half = a_df.iloc[:half]
    second_half = a_df.iloc[half:2 * half]
    return [first_half, second_half]

In [34]:
def offest_df(incorrect):
    '''
    Rotates the 'answers' column down by one row so each question is paired
    with the answer of the row before it; the first row receives the last
    row's answer.

    The dataframe is modified in place and also returned. An empty dataframe
    is returned unchanged.

    :param incorrect: dataframe with an 'answers' column
    :return: the same dataframe object, with 'answers' rotated by one position
    '''
    # Nothing to rotate, and .iloc[-1] would raise IndexError on an empty column
    if len(incorrect) == 0:
        return incorrect

    answers = incorrect['answers']
    # Build the rotated values directly (last value first, then everything but the last).
    # Unlike shift(1), this never introduces a temporary NaN, so the column's values
    # are not upcast to float when they are not strings.
    incorrect['answers'] = [answers.iloc[-1], *answers.iloc[:-1]]
    return incorrect

In [35]:
def create_corpus(correct, incorrect):
    '''
    Stacks every question and answer from both dataframes into one text column.

    Order of the stacked pieces: correct questions, correct answers,
    incorrect questions, incorrect answers.
    :param correct: dataframe with 'question' and 'answers' columns
    :param incorrect: dataframe with 'question' and 'answers' columns
    :return: dataframe with a single 'text' column on a 0..n-1 index
    '''
    pieces = [correct['question'], correct['answers'], incorrect['question'], incorrect['answers']]
    texts = pd.concat(pieces, ignore_index=True)
    # ignore_index already produces a fresh 0..n-1 index, so no further reset is needed
    return pd.DataFrame({'text': texts})

In [36]:
def find_reference_index(str_val, corpus):
    '''
    Finds the index label of the first corpus row whose 'text' equals str_val.

    :param str_val: text to look for (exact equality)
    :param corpus: dataframe with a 'text' column
    :return: index label of the first matching row, or -1 when nothing matches.
             Check for -1 before using the result as an array position, since
             -1 is itself a valid position in most containers.
    '''
    # One vectorised comparison instead of materialising every row with iterrows()
    hits = (corpus['text'] == str_val).to_numpy()
    if not hits.any():
        return -1
    # argmax on a boolean array is the position of the first True
    return corpus.index[hits.argmax()]

In [37]:
def calculate_cosign_similarity(index_a, index_b, sim_matrix):
    '''
    Reads a single entry out of a precomputed pairwise similarity matrix.

    :param index_a: row position
    :param index_b: column position
    :param sim_matrix: object that supports two-dimensional [row, column] indexing
                       (presumably a cosine-similarity matrix - confirm at the call site)
    :return: the value stored at sim_matrix[index_a, index_b]
    '''
    cell = (index_a, index_b)
    return sim_matrix[cell]

In [38]:
import pandas as pd

# Data Acquisition

In [39]:
# Validate subframe lengths
subframes = retrieve_subframes()
# Loop variable is named `frame` so it does not share a name with the sentence
# dataframe `df` that is built further down
for frame in subframes:
    print(len(frame))

500
500
500
500
500
500
500
500


In [40]:
# For each of these dataframes, drop the 'Unnamed: 0' column if it exists
subframes = drop_unnamed(subframes)

In [41]:
# Validate
len(subframes)

8

In [42]:
# Concatenate all DataFrames vertically -> Defined behavior
merged_df = pd.concat(subframes, ignore_index=True)

In [43]:
merged_df

Unnamed: 0,question,answers
0,who was the trump ocean club international hot...,the trump ocean club international hotel and t...
1,where was sasha vujačić born,sasha vujačić was born in maribor slovenia
2,what is a region that dead combo was released in,dead combo was released in portugal
3,what is a film directed by wiebke von carolsfeld,marion bridge
4,what country was music for stock exchange rel...,the united states
...,...,...
3995,what horror films came out in 2002,the ring and 28 days later were popular horror...
3996,what language is with the angels composed in,the language of the angels is composed in cele...
3997,what is the release typ eof the album get in t...,the release type of the album get in the ring ...
3998,what is a commune in the country of italy,a commune in italy is a basic local administra...


In [44]:
# Create sentence set
df = create_all_sentence_set(merged_df)

In [45]:
# Randomly Shuffle -> avoid adjacencies
df = df.sample(frac=1, random_state=7)

In [46]:
# validate
df

Unnamed: 0,text,is_question
3641,what is the legal structure of tellus leads,1
5086,romeo and juliet,0
1657,who was born in krasnodar,1
6842,the greatest showman,0
788,what tract is released by derek sherinian,1
...,...,...
5699,the track list from the release titled lounge ...,0
2550,what type of album is the album milestones,1
537,what company published the game bejeweled 2 de...,1
1220,what country is john viener a citizen of,1


# Begin TF-IDF Model

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create Vectorizer
# tokenize_and_stem replaces the built-in regex tokenizer; token_pattern=None says so
# explicitly, so scikit-learn does not warn that the default token_pattern is unused
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, token_pattern=None)

In [48]:
# Get train and testing splits
X_train, X_test, y_train, y_test = get_train_test(df)

In [49]:
len(X_train)

6400

In [50]:
len(X_test)

1600

In [51]:
# Fit and transform on training data, transform test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



In [52]:
from sklearn.linear_model import LogisticRegression

# Init model
lr_classifier = LogisticRegression()

In [53]:
# Train the model
lr_classifier.fit(X_train_tfidf, y_train)

In [54]:
# Predictions
y_pred_lr = lr_classifier.predict(X_test_tfidf)

In [55]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluation
print("TF-IDF + Logistic Regression Classifier with Tokenization and Stemming for Identifying Questions:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

TF-IDF + Logistic Regression Classifier with Tokenization and Stemming for Identifying Questions:
Accuracy: 0.97875
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       796
           1       0.98      0.98      0.98       804

    accuracy                           0.98      1600
   macro avg       0.98      0.98      0.98      1600
weighted avg       0.98      0.98      0.98      1600


In [56]:
print(type(y_test))
print(type(y_pred_lr))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


# Inspect Error Cases

In [57]:
error_analysis_df = pd.DataFrame({'phrase': X_test, 'actual': y_test, 'predicted': y_pred_lr})

In [58]:
errors = error_analysis_df[error_analysis_df['actual'] != error_analysis_df['predicted']]

In [59]:
errors[errors['actual'] == 0]

Unnamed: 0,phrase,actual,predicted
6339,gridiron is a type of sports book,0,1
6661,lebanon is in the eastern european time zone e...,0,1
4539,the name of the song is the collection,0,1
3582,one famous person who died from pneumonia was ...,0,1
5980,westlifes studio album is named westlife,0,1
799,now thats what i call music 18 was released in...,0,1
8,the developer who designed asterix and the gre...,0,1
6898,now thats what i call music,0,1
3547,people who are exposed to extremely high tempe...,0,1
7321,meitner is named after the austrian physicist ...,0,1


In [60]:
errors[errors['actual'] == 1]

Unnamed: 0,phrase,actual,predicted
3626,is punk rock rarities a album or dvd,1,0
6239,how is the drug nefazodone hydrochloride admi...,1,0
3331,is nightsongs a drama or comedy,1,0
4483,name a 2002 indian tamil romantic biographical...,1,0
4133,how is venoforce i dosed,1,0
491,is feel the noise of liar liar considered a drama,1,0
2467,steve hackett wrote this song,1,0
6589,how is a bupivacaine hydrochloride 5 injectabl...,1,0
1329,is one to one pop music or rock,1,0
757,name a year the boston celtics won the nba fin...,1,0


In [61]:
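# Overall Analysis: Pretty Good -> but note the two tables above show errors in both directions:
# answers predicted as questions AND questions predicted as answers (so false negatives do occur)
# Errors are intensely skewed to sentences using "question words"
# Still - This is very good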

In [62]:
# Save Model
import joblib

# Persist the fitted classifier and the fitted vectorizer as two separate files.
# NOTE(review): the vectorizer was created with tokenizer=tokenize_and_stem, and pickle
# stores functions by reference, so code that loads q_or_a_vectorizer.pkl presumably
# needs tokenize_and_stem available under the same module path - confirm in the consumer.
# NOTE(review): the file names mix "q_or_no" and "q_or_a"; confirm which names consumers
# expect before renaming either one.
joblib.dump(lr_classifier, '../models/q_or_no.pkl')
joblib.dump(tfidf_vectorizer, '../models/q_or_a_vectorizer.pkl')

['../models/q_or_a_vectorizer.pkl']

In [63]:
# Visualizations