In [1]:
import logging
import os
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from numpy import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
# Separator-like punctuation that clean_text replaces with a single space.
# Raw string: the regex escapes (\[ \] \|) must reach the regex engine untouched;
# in a plain string literal Python flags them as invalid escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is NOT a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_'; clean_text deletes these outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopword list from the NLTK corpus, stored as a set so the
# per-word membership checks in clean_text are hash lookups.
STOPWORDS = {word for word in stopwords.words('english')}

def clean_text(text):
    """
        Normalise a raw post before it is vectorised.

        text: a string, possibly containing HTML markup

        return: the text with markup stripped and lower-cased, characters
                matched by REPLACE_BY_SPACE_RE turned into spaces, characters
                matched by BAD_SYMBOLS_RE removed, and English stopwords
                dropped; the remaining words are joined by single spaces
    """
    # Parse as HTML and keep only the text content, then lowercase it.
    plain = BeautifulSoup(text, "lxml").text.lower()
    # Separators become spaces first so adjacent words are not glued together
    # when the remaining unwanted symbols are deleted.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument also collapses runs of whitespace.
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)

In [3]:
def train_pkl(save_to="pkls/model.pkl"):
    """
        Train the tag classifier on training_data.csv and pickle the fitted
        pipeline.

        Rows without a tag are dropped, every post is passed through
        clean_text, and a CountVectorizer -> TfidfTransformer -> SGDClassifier
        pipeline is fitted on the 70% training split.

        save_to: path the fitted pipeline is pickled to; a missing parent
                 directory is created

        return: None
    """
    df = pd.read_csv('training_data.csv')
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the frame returned by read_csv.
    df = df[pd.notnull(df['tags'])].copy()
    df['post'] = df['post'].apply(clean_text)
    print(df.head())
    X = df.post
    y = df.tags
    # The held-out 30% is not used in this function; the split is kept so the
    # model is still fitted on the same 70% of rows as before.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)
    sgd = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                   ])
    sgd.fit(X_train, y_train)
    # Create the target directory if needed so a finished training run is not
    # lost to a FileNotFoundError; a bare filename has no directory part.
    out_dir = os.path.dirname(save_to)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Context manager closes the file even if pickling raises.
    with open(save_to, 'wb') as model_file:
        pickle.dump(sgd, model_file)
# Fit the pipeline and write it to the path that load_model reads by default.
train_pkl(save_to="pkls/model.pkl")

                                                post         tags
0  causing behavior c# datetime type test public ...           c#
1  dynamic html load iframe aspnet 40 site users ...      asp.net
2  convert float value minsec trying convert seco...  objective-c
3  net framework 4 redistributable wondering get ...         .net
4  trying calculate print mean returning rather n...       python


In [4]:
def load_model(source="pkls/model.pkl"):
    """
        Load a pickled model from disk.

        source: path of the pickle file to read

        return: the unpickled object
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only load pickles from a trusted source.
    # Context manager closes the file even if unpickling raises.
    with open(source, 'rb') as pkl:
        return pickle.load(pkl)
# Module-level model used by predict_tag; read from the default pickle path.
model = load_model(source="pkls/model.pkl")

In [7]:
def predict_tag(text):
    """
        Predict the tag for a single raw post.

        text: a string; it is normalised with clean_text before prediction

        return: the result of model.predict for the one cleaned post
                (uses the module-level `model`)
    """
    cleaned = clean_text(text)
    # Echo the cleaned text so the input actually seen by the model is visible.
    print(cleaned)
    prediction = model.predict([cleaned])
    return prediction
# Smoke test on a noisy post: digits and stray symbols around the word "php".
predict_tag(text="php php php 090934324  #($@$*%@*#%@(#$# php")

php php php 090934324 # # ## php


array(['php'], dtype='<U13')