In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the question dataset; the CSV's first column becomes the DataFrame index.
df = pd.read_csv('Book1.csv', index_col=0)
# Preview the first rows (columns: Text, Tags).
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [3]:
import ast

# Each Tags cell is read from the CSV as the *string* form of a Python list
# (e.g. "['sql', 'asp.net']"); literal_eval turns it back into a real list
# without executing arbitrary code.
df['Tags'] = df['Tags'].apply(ast.literal_eval)

In [4]:
# Re-inspect the first rows: Tags should now display as lists, not quoted strings.
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


In [5]:
# Flatten each list of tags into one space-separated string (e.g. "sql asp.net").
df['Tags'] = df['Tags'].apply(" ".join)

In [6]:
import nltk
from nltk.stem.porter import PorterStemmer
# One shared Porter stemmer instance; stem() below calls ps.stem on every token.
ps = PorterStemmer()

In [7]:
def stem(text):
    """Return ``text`` with every whitespace-separated token stemmed by ``ps``.

    The stemmed tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space in the result.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [8]:
# Replace every question text with its stemmed form. This overwrites the Text
# column, so re-running the cell stems text that has already been stemmed.
df['Text'] = df['Text'].apply(stem)

In [9]:
# Inspect the Tags column: one space-separated tag string per question.
df['Tags']

2               sql asp.net
4                   c# .net
5                       c++
6                      .net
8                        c#
                ...        
186100               python
186108        ruby-on-rails
186110    javascript jquery
186126                  php
186131               jquery
Name: Tags, Length: 21047, dtype: object

In [10]:
# Stemmed text of the question whose index label is 2.
df.loc[2, 'Text']

'aspnet site map ha anyon got experi creat sqlbase aspnet sitemap providersi have got the default xml file websitemap work properli with my menu and sitemappath control but i will need a way for the user of my site to creat and modifi page dynamicallyi need to tie page view permiss into the standard aspnet membership system as well'

In [11]:
# Stemmed text of the question whose index label is 4.
df.loc[4, 'Text']

'ad script function to net applic i have a littl game written in c it use a databas as backend it is a trade card game and i want to implement the function of the card as a scriptwhat i mean is that i essenti have an interfac icard which a card class implement public class card056 icard and which contain function that are call by the gamenow to make the thing maintainablemodd i would like to have the class for each card as sourc code in the databas and essenti compil it on first use so when i have to addchang a card i will just add it to the databas and tell my applic to refresh without need ani assembl deploy especi sinc we would be talk about 1 assembl per card which mean hundr of assembliesi that possibl regist a class from a sourc file and then instanti it etcicard cardscurr new mygamecardlibrarycard056cardscurrentonenterplayref currentgamestateth languag is c but extra bonu if it is possibl to write the script in ani net languag'

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms, English stop words removed.
# NOTE(review): the stop-word list is applied to text that has already been stemmed,
# so stemmed forms such as 'thi' or 'ha' presumably slip past it — confirm whether
# stop-word removal should happen before stemming.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [13]:
# Learn the vocabulary from the stemmed texts and build the term-count matrix,
# converted from sparse to a dense array (one row per question, one column per term).
# NOTE(review): the dense copy is large for this many rows; cosine_similarity is
# documented to accept sparse input as well — consider whether .toarray() is needed.
vector = cv.fit_transform(df['Text']).toarray()

In [14]:
# Sanity check: (number of questions, vocabulary size).
vector.shape

(21047, 5000)

In [15]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# (the deprecation warning is hidden by the global warnings filter), so prefer
# `get_feature_names_out()` and fall back only on older installations.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# Normalise to a list of plain str so the cell displays identically on every version.
[str(name) for name in feature_names]

['00',
 '00f',
 '01',
 '010',
 '0126',
 '02',
 '03',
 '030',
 '04',
 '05',
 '0523',
 '0526',
 '06',
 '07',
 '08',
 '09',
 '092704',
 '0949',
 '0class',
 '0i',
 '0int',
 '0px',
 '0s',
 '0the',
 '0thi',
 '0what',
 '0x0',
 '0x02ab43f0',
 '0x10',
 '0xf',
 '0xff',
 '10',
 '100',
 '100k',
 '100px',
 '101',
 '1010',
 '101823139',
 '102',
 '1024',
 '103720682',
 '104',
 '105',
 '106',
 '10f',
 '10i',
 '10px',
 '11',
 '110',
 '112',
 '113007',
 '113448644',
 '116',
 '12',
 '120',
 '121',
 '123',
 '1234',
 '127',
 '127001',
 '128',
 '13',
 '130',
 '132',
 '133710507',
 '133710508',
 '133710509',
 '133710510',
 '133710511',
 '133710512',
 '133710513',
 '133710514',
 '133710515',
 '133833996',
 '133856841',
 '133906901',
 '14',
 '1400',
 '142',
 '15',
 '150',
 '152917803',
 '159',
 '16',
 '160',
 '162539360',
 '16bit',
 '17',
 '1732819',
 '18',
 '180',
 '187',
 '19',
 '192',
 '1a',
 '1i',
 '1px',
 '1s',
 '1st',
 '20',
 '200',
 '2003',
 '2005',
 '2007',
 '2008',
 '2009',
 '200px',
 '201',
 '2010',


In [16]:
# Number of questions; should equal the number of rows in `vector`.
df['Text'].shape

(21047,)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Pairwise cosine similarity between every pair of rows in `vector`; the result is a
# square matrix with one row and one column per question.
# NOTE(review): with ~21k questions that is roughly 443 million entries — make sure
# the machine has enough memory before running this cell.
similarity = cosine_similarity(vector)

In [19]:
# Peek at the matrix; the diagonal holds each question's similarity with itself.
similarity

array([[1.        , 0.035007  , 0.01992048, ..., 0.        , 0.14147911,
        0.08219949],
       [0.035007  , 1.        , 0.34588866, ..., 0.0285831 , 0.22641187,
        0.09208185],
       [0.01992048, 0.34588866, 1.        , ..., 0.03659625, 0.43482828,
        0.13754606],
       ...,
       [0.        , 0.0285831 , 0.03659625, ..., 1.        , 0.        ,
        0.0503367 ],
       [0.14147911, 0.22641187, 0.43482828, ..., 0.        , 1.        ,
        0.02658174],
       [0.08219949, 0.09208185, 0.13754606, ..., 0.0503367 , 0.02658174,
        1.        ]])

In [20]:
# Rows whose Tags string is exactly 'sql asp.net' (whole-string match, not substring).
df.loc[df['Tags'].eq('sql asp.net')]

Unnamed: 0,Text,Tags
2,aspnet site map ha anyon got experi creat sqlb...,sql asp.net


In [21]:
# Renumber the rows 0..n-1 and discard the old labels, so that an index label is also
# the row's position (the rows of `similarity` are addressed by position).
df = df.reset_index(drop=True)
df

Unnamed: 0,Text,Tags
0,aspnet site map ha anyon got experi creat sqlb...,sql asp.net
1,ad script function to net applic i have a litt...,c# .net
2,should i use nest class in thi case i am work ...,c++
3,homegrown consumpt of web servic i have been w...,.net
4,automat updat version number i would like the ...,c#
...,...,...
21042,use python subprocess call to invok python scr...,python
21043,rail 3 activerecord api build method i am fair...,ruby-on-rails
21044,how to move an entir div element up x pixel i ...,javascript jquery
21045,get class name from file i have a php file whi...,php


In [22]:
def recommend(Tag, top_n=9):
    """Print tags found on the questions most similar to one tagged ``Tag``.

    The first row of ``df`` whose ``Tags`` string equals ``Tag`` exactly is the
    query. Its row of ``similarity`` is ranked from most to least similar, the
    ``top_n`` best-scoring *other* rows are inspected, and the individual tags of
    those rows are printed one per line, most similar neighbour first and without
    duplicates. ``Tag`` itself is printed first when none of the neighbours
    contributed it.

    Parameters
    ----------
    Tag : str
        Complete ``Tags`` string to look up, e.g. ``'python'`` or ``'sql asp.net'``.
    top_n : int, default 9
        How many nearest neighbours to inspect.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``Tags`` equal to ``Tag``.
    """
    # Work with row *positions*, not index labels: `similarity` is addressed by
    # position, so this stays correct even when df's index is not 0..n-1.
    hits = (df['Tags'] == Tag).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"no row in df has Tags exactly equal to {Tag!r}")
    query_pos = int(hits[0])

    ranked = sorted(
        enumerate(similarity[query_pos]), key=lambda pair: pair[1], reverse=True
    )
    # Exclude the query row explicitly rather than assuming it sorts first; a
    # duplicate or all-zero vector can put another row at rank 0.
    neighbours = [pos for pos, _ in ranked if pos != query_pos][:top_n]

    collected = []
    for pos in neighbours:
        tags = df['Tags'].iloc[pos]
        # A single-tag string equals its own only token, so this test is true only
        # for rows carrying several tags; single-tag neighbours contribute nothing.
        # NOTE(review): behaviour kept from the original — confirm this filter is
        # intended and was not meant to compare against `Tag` instead.
        if tags not in tags.split():
            collected.extend(tags.split())

    # dict.fromkeys de-duplicates while keeping similarity order, so the output
    # is deterministic (iterating a set of strings is not stable across runs).
    suggestions = list(dict.fromkeys(collected))
    if Tag not in suggestions:
        print(Tag)
    for tag in suggestions:
        print(tag)
        


In [23]:
# Smoke test: tags suggested for a question tagged exactly 'python'.
recommend('python')

ruby
python
java


In [24]:
# Every distinct tag combination present in the data; these are the exact strings
# that can be matched with `df['Tags'] == ...`.
df['Tags'].unique()

array(['sql asp.net', 'c# .net', 'c++', '.net', 'c#', 'c# asp.net', 'sql',
       'html', 'c', 'asp.net', 'php mysql', 'ruby', 'java', 'php',
       'sql mysql', 'javascript', 'css', 'objective-c', '.net sql',
       'java .net', 'c++ python', 'java php', 'mysql', 'c++ c', 'python',
       'ruby-on-rails', 'c# sql', '.net asp.net', 'html css',
       'ruby-on-rails ruby', 'jquery', 'android', 'javascript html css',
       'php html', 'asp.net css', 'javascript jquery',
       'javascript python', 'c# java', 'iphone', 'asp.net .net',
       'css html', 'c# asp.net javascript', 'c# c++', '.net html',
       'php javascript', 'java javascript',
       'javascript iphone objective-c', 'asp.net javascript html',
       'javascript html', 'c# java python ruby', 'c# asp.net css',
       'c# java python', 'javascript jquery html',
       'c# c++ iphone objective-c', 'asp.net jquery', 'javascript iphone',
       'c# .net sql', 'java ruby-on-rails', 'sql ruby-on-rails',
       'java mysql', 'c# 

In [25]:
import pickle

# Persist the similarity matrix. The context manager guarantees the file is flushed
# and closed; HIGHEST_PROTOCOL is what the previous literal -1 selected.
with open('similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh, pickle.HIGHEST_PROTOCOL)

In [27]:
# Read the pickled similarity matrix back from disk; the context manager closes the
# handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code — only load files written by this
# notebook, never pickles from an untrusted source.
with open('similarity.pkl', 'rb') as file:
    similarity = pickle.load(file)


In [28]:
# Persist the DataFrame. Closing the write handle before reopening the file ensures
# every byte is on disk when it is read back.
with open('data.pkl', 'wb') as file:
    pickle.dump(df, file, pickle.HIGHEST_PROTOCOL)

# Round-trip check: load the pickle that was just written and display it.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('data.pkl', 'rb') as file:
    data = pickle.load(file)
data

Unnamed: 0,Text,Tags
0,aspnet site map ha anyon got experi creat sqlb...,sql asp.net
1,ad script function to net applic i have a litt...,c# .net
2,should i use nest class in thi case i am work ...,c++
3,homegrown consumpt of web servic i have been w...,.net
4,automat updat version number i would like the ...,c#
...,...,...
21042,use python subprocess call to invok python scr...,python
21043,rail 3 activerecord api build method i am fair...,ruby-on-rails
21044,how to move an entir div element up x pixel i ...,javascript jquery
21045,get class name from file i have a php file whi...,php
