### DSPT6 - Adding Data Science to a Web Application

The purpose of this notebook is to demonstrate:
- Simple online analysis of data from a user of the Twitoff app or an API
- Training a more complicated model offline and serializing the result for online use

In [10]:
import os
import pickle
import sqlite3

import basilica
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
# Open a connection to the SQLite database file used by the notebook.
# NOTE(review): the file name is spelled "twtioff" (not "twitoff") -- confirm it
# matches the file on disk; sqlite3.connect() creates a new, empty database when
# the path does not exist instead of raising.
db_path = "../twitoff/twtioff.sqlite"
conn = sqlite3.connect(db_path)

In [4]:
def get_data(query, conn, params=()):
    '''Run a SQL statement on a SQLite connection and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    conn : sqlite3.Connection
        Open database connection.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. Prefer this over
        building the SQL text with string formatting.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the result-set
        columns. Statements that produce no result set yield an empty frame.
    '''
    cursor = conn.cursor()
    try:
        rows = cursor.execute(query, params).fetchall()
        # cursor.description is None when the statement returns no result set
        # (e.g. DDL), so fall back to "no columns" instead of failing.
        columns = [col[0] for col in (cursor.description or ())]
    finally:
        # Release the cursor even if execution or fetching fails.
        cursor.close()

    return pd.DataFrame(data=rows, columns=columns)

In [None]:
# Fetch id, text, stored embedding and username for every tweet row that has a
# matching row in `user`.
# NOTE(review): `username` is selected from `tweet` even though `user` is
# joined -- confirm which table actually owns that column.
query = '''
SELECT
    tweet.id,
    tweet.text,
    tweet.embedding,
    tweet.username
From tweet
Join user On tweet.user_id= user.id;
'''


def _decode_embedding(raw):
    '''Unpickle one stored embedding after dropping its first two items.'''
    # NOTE(review): presumably the first two items are a prefix added when the
    # row was written -- TODO confirm. pickle.loads can execute arbitrary code,
    # so only run this against a database you trust.
    return pickle.loads(raw[2:])


df = get_data(query, conn)
df['embedding_decoded'] = df['embedding'].apply(_decode_embedding)
print(df.shape)
df.head()

In [None]:
# Spot check: decode the first row's stored embedding by hand.
first_row = df.iloc[0]
pickle.loads(first_row['embedding'][2:])

In [None]:
# Row count per username in the loaded frame.
df['username'].value_counts()

In [None]:
# Build the feature matrix and binary labels for a two-user classifier.
user1_embeddings = df.loc[df.username == 'elonmusk', 'embedding_decoded']
user2_embeddings = df.loc[df.username == 'nasa', 'embedding_decoded']
embeddings = pd.concat([user1_embeddings, user2_embeddings])

# One column per embedding dimension; the column list fixes the width at 768.
embeddings_df = pd.DataFrame(embeddings.tolist(),
                             columns=[f'dim{i}' for i in range(768)])
# Label 1 = first user, 0 = second user, in the same order as the concat above.
labels = np.concatenate([np.ones(len(user1_embeddings)),
                         np.zeros(len(user2_embeddings))])

# Features and labels must stay row-aligned for the train/test split.
assert len(embeddings_df) == len(labels)
print(embeddings_df.shape, labels.shape)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split = train_test_split(
    embeddings_df,
    labels,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

print(X_train.shape, X_test.shape)

In [None]:
# Fit a logistic-regression classifier on the training split
# (iteration cap set to 1000).
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Score the held-out split and print the per-class metrics report.
y_pred = lr.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 -- confirm
# the pinned version, or migrate to ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(lr, X_test, y_test, cmap='Blues')
plt.title('LogReg Confusion Matrix');

In [None]:
# Serialize the fitted model for the web app to load later. The file is opened
# in binary *write* mode, and the context manager closes the handle.
with open('../models/logreg.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [None]:
# Load the serialized model back from disk to check that it round-trips.
# pickle.load takes only the open binary file; only unpickle files you trust.
with open('../models/logreg.pkl', 'rb') as model_file:
    lr_unpickled = pickle.load(model_file)
lr_unpickled

In [None]:
# Read the Basilica API key from the environment instead of committing it to
# the notebook. A key that has ever been stored in the notebook should be
# treated as leaked and rotated.
BASILICA_KEY = os.environ.get('BASILICA_KEY')
if not BASILICA_KEY:
    raise RuntimeError(
        'Set the BASILICA_KEY environment variable before running this cell.')
BASILICA = basilica.Connection(BASILICA_KEY)

In [None]:
# Embed one sample sentence with the 'twitter' model to try the classifier on.
example_text = "The MARS rover just reported new and interesting data!"
example_embedding = BASILICA.embed_sentence(example_text, model='twitter')

In [None]:
# Predicted class probabilities for the single example sentence.
# NOTE(review): column order presumably follows lr_unpickled.classes_
# (label 0 = 'nasa', label 1 = 'elonmusk' per the label construction) -- confirm.
example_probs = lr_unpickled.predict_proba([example_embedding])
example_probs[0]