In [0]:
import sys
from utils import *
import numpy as np
import pandas as pd
import pickle
import re
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings emitted by the model
# fits further down (e.g. solver convergence) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Print the current working directory of the runtime.
!pwd

/content


In [2]:
% cd /content/drive/My\ Drive/Colab Notebooks/Natural Language Processing/Project/NLP_Project

/content/drive/My Drive/Colab Notebooks/Natural Language Processing/Project/NLP_Project


In [0]:
# List the contents of the current (project) directory.
!ls

common		       main_bot.py		tfidf_vectorizer.pkl
data		       __pycache__		thread_embeddings_by_tags
db.sqlite3	       setup_google_colab.py	utils.py
dialogue_manager.py    starspace_embedding.tsv	week5-project.ipynb
intent_recognizer.pkl  tag_classifier.pkl


In [1]:
# Mount Google Drive at /content/drive (prompts for an authorization code).
# NOTE(review): this cell was executed before the `cd` cell that appears
# above it (In [1] vs In [2]); on a fresh "Run all" it has to come first.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def tfidf_features(X_train, X_test, vectorizer_path):
    """Fit a TF-IDF vectorizer on the training texts and persist it.

    The vectorizer is fitted on ``X_train`` only and then used to transform
    both splits. The fitted vectorizer is pickled to ``vectorizer_path`` so
    that it can be reloaded later.

    Args:
        X_train: Iterable of training documents (strings).
        X_test: Iterable of test documents (strings).
        vectorizer_path: Path of the file the fitted vectorizer is pickled
            to. The file is opened in binary write mode and overwritten.

    Returns:
        A tuple ``(X_train_tfidf, X_test_tfidf)`` with the transformed
        training and test data, as returned by the vectorizer.
    """
    # Unigrams and bigrams; terms must occur in at least 5 documents and in
    # at most 90% of them. The token pattern treats every run of
    # non-whitespace characters as one token.
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
        token_pattern=r'\S+',
    )

    # Fit on the training split only, then reuse the fitted vocabulary and
    # IDF weights for the test split.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # The context manager closes the file even if pickling fails.
    with open(vectorizer_path, 'wb') as vectorizer_file:
        pickle.dump(tfidf_vectorizer, vectorizer_file)

    return X_train_tfidf, X_test_tfidf

In [0]:
# Draw the same number of rows from each corpus; the fixed random_state keeps
# the samples reproducible between runs.
sample_size = 200000
dialogue_df = (
    pd.read_csv('data/dialogues.tsv', sep='\t')
    .sample(n=sample_size, random_state=0)
)
stackoverflow_df = (
    pd.read_csv('data/tagged_posts.tsv', sep='\t')
    .sample(n=sample_size, random_state=0)
)

In [0]:
# Preview the first rows of the sampled dialogue data.
dialogue_df.head()

Unnamed: 0,text,tag
82925,"Donna, you are a muffin.",dialogue
48774,He was here last night till about two o'clock....,dialogue
55394,"All right, then make an appointment with her s...",dialogue
90806,"Hey, what is this-an interview? We're supposed...",dialogue
107758,Yeah. He's just a friend of mine I was trying ...,dialogue


In [0]:
# Preview the first rows of the sampled StackOverflow posts.
stackoverflow_df.head()

Unnamed: 0,post_id,title,tag
2168983,43837842,Efficient Algorithm to compose valid expressio...,python
1084095,15747223,Why does this basic thread program fail with C...,c_cpp
1049020,15189594,Link to scroll to top not working,javascript
200466,3273927,Is it possible to implement ping on windows ph...,c#
1200249,17684551,GLSL normal mapping issue,c_cpp


In [0]:
# Run every utterance / post title through the project's text_prepare helper
# and store the result back in the same column.
dialogue_df['text'] = list(map(text_prepare, dialogue_df['text']))
stackoverflow_df['title'] = list(map(text_prepare, stackoverflow_df['title']))

In [0]:
################## Intent Recognition ######################

In [0]:
# Build a single corpus from the dialogue utterances and the StackOverflow
# titles, label every sample by its source, and hold out 10% for testing.

X = np.concatenate((dialogue_df['text'].values, stackoverflow_df['title'].values))
y = ['dialogue'] * len(dialogue_df) + ['stackoverflow'] * len(stackoverflow_df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

# Fit TF-IDF on the training split; tfidf_features also pickles the fitted
# vectorizer to the TFIDF_VECTORIZER resource path.
X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, RESOURCE_PATH['TFIDF_VECTORIZER'])

Train size = 360000, test size = 40000


In [0]:
# Training a logistic regression classifier for binary classification
# (dialogue vs. stackoverflow) on the TF-IDF features.
# NOTE(review): warnings are globally suppressed earlier in the notebook, so
# a solver convergence warning would not be visible here — worth verifying.

intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Check test accuracy of the intent recognizer on the held-out split.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.9916


In [0]:
# Persist the trained intent recognizer. The context manager guarantees the
# file is flushed and closed even if pickling fails.
with open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb') as intent_recognizer_file:
    pickle.dump(intent_recognizer, intent_recognizer_file)

In [0]:
###### Programming Language Classification ################

In [0]:
# Features are the prepared post titles; targets are the post tags.
X = stackoverflow_df['title'].values
y = stackoverflow_df['tag'].values

In [0]:
# Hold out 20% of the StackOverflow posts for testing the tag classifier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

Train size = 160000, test size = 40000


In [0]:
# Reload the TF-IDF vectorizer that tfidf_features pickled during the intent
# recognition step, so both models share the same feature space.
# NOTE: unpickling executes arbitrary code — only load files produced by this
# notebook.
with open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [0]:
# One-vs-rest logistic regression over the programming-language tags.
# NOTE(review): warnings are globally suppressed earlier in the notebook, so
# a solver convergence warning would not be visible here — worth verifying.
tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False),
                    n_jobs=None)

In [0]:
# Check test accuracy of the tag classifier on the held-out split.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.80005


In [0]:
# Persist the trained tag classifier. The context manager guarantees the file
# is flushed and closed even if pickling fails.
with open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb') as tag_classifier_file:
    pickle.dump(tag_classifier, tag_classifier_file)

In [0]:
#############################################################################

In [0]:
########## Ranking questions with embeddings ##############

In [0]:
# Load the StarSpace embeddings and their dimensionality from the TSV file
# using the project's load_embeddings helper.
# (presumably a word -> vector mapping; verify against utils.py)
starspace_embeddings, embeddings_dim = load_embeddings('starspace_embedding.tsv')

In [0]:
# Load the full set of tagged posts (no sampling this time).
posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')

In [0]:
# Preview the first rows of the full posts table.
posts_df.head()

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#
2,39,Reliable timer in a console application,c#
3,42,Best way to allow plugins for a PHP application,php
4,59,"How do I get a distinct, ordered list of names...",c#


In [0]:
# Number of non-null titles per tag, as a Series indexed by tag.
counts_by_tag = posts_df.groupby('tag')['title'].count()

In [0]:
# Display the per-tag title counts.
counts_by_tag

tag
c#            394451
c_cpp         281300
java          383456
javascript    375867
php           321752
python        208607
r              36359
ruby           99930
swift          34809
vb             35044
Name: title, dtype: int64

In [0]:
# For every tag, embed all post titles and dump (post ids, vectors) to
# '<THREAD_EMBEDDINGS_FOLDER>/<tag>.pkl'.
os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)

for tag in counts_by_tag.index:
    tag_posts = posts_df[posts_df['tag'] == tag]

    tag_post_ids = tag_posts['post_id'].values

    # Size the matrix by the number of rows actually iterated below. The
    # groupby count skips null titles, so using it as the row count could
    # make the matrix too small and raise an IndexError.
    tag_vectors = np.zeros((len(tag_posts), embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)

    # Dump post ids and vectors to a file; the context manager closes the
    # handle after each tag instead of leaking one per iteration.
    filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))
    with open(filename, 'wb') as embeddings_file:
        pickle.dump((tag_post_ids, tag_vectors), embeddings_file)

In [0]:
# Confirm that one pickle file per tag was written.
!ls thread_embeddings_by_tags

c_cpp.pkl  java.pkl	   php.pkl     r.pkl	 swift.pkl
c#.pkl	   javascript.pkl  python.pkl  ruby.pkl  vb.pkl


In [3]:
! python main_bot.py --token=1122603454:AAEcgaY57DTwTeeqmrqpZdeQjq9VZDbjquI

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Loading resources...
Building encoder and decoder ...
Models built and ready to go!
Ready to talk!
An update received.
Update content: {'update_id': 21406700, 'message': {'message_id': 100, 'from': {'id': 940187485, 'is_bot': False, 'first_name': 'Aman', 'last_name': 'Choudhary', 'language_code': 'en'}, 'chat': {'id': 940187485, 'first_name': 'Aman', 'last_name': 'Choudhary', 'type': 'private'}, 'date': 1591176135, 'text': 'hi'}}
An update received.
Update content: {'update_id': 21406701, 'message': {'message_id': 102, 'from': {'id': 940187485, 'is_bot': False, 'first_name': 'Aman', 'last_name': 'Choudhary', 'language_code': 'en'}, 'chat': {'id': 940187485, 'first_name': 'Aman', 'last_name': 'Choudhary', 'type': 'private'}, 'date': 1591191882, 'text': 'how are you'}}
An update received.
Update content: {'update_id': 21406702, 'message': {'message_id': 104, 'from': {'id': 94018