In [1]:
import logging
# Root logger: only ERROR and above, rendered as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import os as os

In [3]:
# Number of top-ranked technology KnowHowIds to keep; consumed by .head(DIM)
# when the technologies are ranked by how many rows reference them.
DIM = 256

In [4]:
# Load the raw know-how table and discard the 'Unnamed: 5' column in one
# expression, so `kh` is never observable in its un-dropped state.
kh = pd.read_csv("../Input/know-how.csv", encoding="latin1").drop('Unnamed: 5', axis=1)
# Quick sanity check of what was loaded: dimensions, then column labels.
print(kh.shape)
print(kh.columns)

(32395, 5)
Index(['PersonId', 'KnowHowId', 'KnowHow', 'Category', 'Subcategory'], dtype='object')


In [5]:
# Per-KnowHowId non-null counts of every column, restricted to 'Technologies' rows.
k1 = kh.loc[kh.Category == 'Technologies'].groupby('KnowHowId').count()
# KnowHowIds of the DIM entries with the highest PersonId counts, most frequent first.
ranked = k1.sort_values(by='PersonId', ascending=False).iloc[:DIM].index

In [6]:
# Lookup table of the selected technologies: one row per KnowHowId with its name,
# limited to the ids contained in `ranked`.
technologies = kh.loc[kh.Category == 'Technologies', ['KnowHowId', 'KnowHow']].copy()
technologies = technologies.drop_duplicates(subset=['KnowHowId'])
technologies = technologies[technologies['KnowHowId'].isin(ranked)]
# KnowHowId is deliberately kept as a regular column (no set_index): the
# filtering step that builds kh_tech reads it as `technologies.KnowHowId`.
# A bare `technologies.set_index('KnowHowId')` would only build and discard
# a new frame, since set_index is not in-place by default.
technologies.head()

Unnamed: 0,KnowHowId,KnowHow
8,2882,Confluence
9,3072,Visual Basic for Applications (VBA)
10,3107,MS Windows
11,3201,Windows Server
12,3252,HTML


In [7]:
# Rows of `kh` whose KnowHowId is one of the selected technologies, without the
# Category column. The axis is passed by keyword: the positional form
# drop(label, 1) was deprecated in pandas 1.3 and removed in pandas 2.0.
kh_tech = kh[kh.KnowHowId.isin(technologies.KnowHowId)].drop('Category', axis=1).copy()

In [8]:
# One "sentence" per person: the distinct KnowHow names found in that person's rows.
# The KnowHow column is selected before apply so the lambda receives a plain
# Series and never the whole group frame (including the grouping column),
# which newer pandas versions warn about.
# NOTE: set() iteration order is arbitrary, so the order of names inside each
# list carries no meaning and may differ between interpreter runs.
sentences = DataFrame(
    kh_tech.groupby('PersonId')['KnowHow'].apply(lambda names: list(set(names))),
    columns=['KnowHow'],
)
sentences.head()

Unnamed: 0_level_0,KnowHow
PersonId,Unnamed: 1_level_1
1,"[MySQL, MS Office, Active Directory, Lotus Not..."
3,"[MS-DOS, TCP/IP, MySQL, HTTP, Assembler, Windo..."
5,"[Java, HP ALM, MS Outlook, Pascal, Visual Basi..."
7,"[IBM Rational RequisitePro, Unix, MySQL, Perl,..."
8,"[Windows NT, HP ALM, HP Quality Center, Window..."


In [9]:
from gensim import corpora, models, similarities



In [10]:
# Build the gensim token dictionary from every person's list of know-how names.
dictionary = corpora.Dictionary(documents=sentences['KnowHow'])
# Printing the dictionary gives a short summary (token count and a sample of tokens).
print(dictionary)

Dictionary(255 unique tokens: ['TeamCity', 'MS SQL Server', 'CSS3', 'MFC', 'MS SQL Server 2008']...)


In [11]:
# Bag-of-words representation of each person's know-how list, in the same
# order as the rows of `sentences`.
bow_corpus = list(map(dictionary.doc2bow, sentences['KnowHow']))

In [12]:
# Fit the TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(corpus=bow_corpus)
# ... and apply it to that same corpus to obtain the re-weighted view.
corpus_tfidf = tfidf[bow_corpus]

In [13]:
# 6-topic LSI model fitted directly on the bag-of-words corpus (not the TF-IDF one).
lsi = models.LsiModel(corpus=bow_corpus, num_topics=6, id2word=dictionary)
# The same corpus projected into the LSI topic space.
corpus_lsi = lsi[bow_corpus]

In [14]:
print('ready')
# Cache file for the bag-of-words LDA model.
# NOTE(review): the "128" in the name does not match DIM = 256, and the cache is
# not keyed on DIM or the training parameters; delete the file to force a
# retrain after changing any of them.
file1 = "know_how_128.lda"

if os.path.exists(file1):
    # Reuse the previously trained model instead of repeating the long training run.
    lda = models.LdaModel.load(file1)
    print("lda model loaded")
else:
    print ("looong calculation begins")
    # Fixed random_state so that retraining from a fresh kernel is repeatable.
    lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=6, passes=100,
                          random_state=42)
    lda.save(file1)
    print("lda model saved")

# The bag-of-words corpus expressed as LDA topic distributions.
corpus_lda = lda[bow_corpus]

ready
lda model loaded


In [15]:
# Display the topics of `lda`, the LDA model built on (or loaded for) the
# bag-of-words corpus, as (topic id, weighted-term string) pairs.
lda.print_topics()

[(0,
  '0.029*"Eclipse" + 0.027*"Java" + 0.027*"JUnit" + 0.025*"Hibernate" + 0.023*"Maven" + 0.022*"XML" + 0.021*"Spring" + 0.021*"MySQL" + 0.021*"Apache Tomcat" + 0.020*"JavaScript"'),
 (1,
  '0.055*"Java" + 0.055*"JavaScript" + 0.050*"MySQL" + 0.043*"PHP" + 0.039*"HTML" + 0.038*"CSS" + 0.037*"C#" + 0.028*"Eclipse" + 0.027*"Git" + 0.026*"MS Windows"'),
 (2,
  '0.064*"Windows XP" + 0.059*"Windows 7" + 0.051*"C++" + 0.047*"C" + 0.037*"Windows 2000" + 0.037*"Windows Vista" + 0.030*"TCP/IP" + 0.029*"C#" + 0.028*"Subversion (SVN)" + 0.026*"Visual Studio"'),
 (3,
  '0.042*"MS Office" + 0.040*"MS Windows" + 0.036*"MS Project" + 0.034*"SQL" + 0.033*"HP Quality Center" + 0.033*"Java" + 0.032*"Atlassian JIRA" + 0.030*"MS Access" + 0.029*"XML" + 0.029*"MS Visio"'),
 (4,
  '0.046*"C#" + 0.035*"WPF" + 0.032*"WCF" + 0.030*"Microsoft Team Foundation Server (TFS)" + 0.029*"MS SQL Server" + 0.028*"ASP.NET" + 0.024*".NET" + 0.023*"JavaScript" + 0.021*"MS Visual Studio 2010" + 0.021*"Visual Studio"'),
 

In [16]:
print('ready')
# Cache file for the LDA model trained on the TF-IDF weighted corpus.
# NOTE(review): the "128" in the name does not match DIM = 256, and the cache is
# not keyed on DIM or the training parameters; delete the file to force a
# retrain after changing any of them.
file2 = "know_how_128_tfidf.lda"

if os.path.exists(file2):
    # Reuse the previously trained model instead of repeating the long training run.
    lda_tfidf = models.LdaModel.load(file2)
    print("lda model loaded")
else:
    print ("looong calculation begins")
    # Fixed random_state so that retraining from a fresh kernel is repeatable.
    lda_tfidf = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=6, passes=100,
                                random_state=42)
    lda_tfidf.save(file2)
    print("lda model saved")

# The TF-IDF corpus expressed as topic distributions of this model.
corpus_lda_tfidf = lda_tfidf[corpus_tfidf]

ready
lda model loaded


In [17]:
# Display the topics of `lda_tfidf`, the LDA model built on (or loaded for)
# the TF-IDF weighted corpus, for comparison with the bag-of-words model.
lda_tfidf.print_topics()

[(0,
  '0.022*"Windows XP" + 0.021*"C++" + 0.020*"C" + 0.020*"Windows 7" + 0.017*"Windows 2000" + 0.016*"TCP/IP" + 0.015*"Windows Vista" + 0.014*"MS Access" + 0.014*"Sparx Enterprise Architect" + 0.014*"Linux"'),
 (1,
  '0.028*"WPF" + 0.025*"WCF" + 0.025*"ASP.NET" + 0.020*"Microsoft Team Foundation Server (TFS)" + 0.019*"MS SQL Server" + 0.018*"ASP.NET MVC" + 0.018*"Entity Framework" + 0.018*"C#" + 0.018*".NET" + 0.018*"LINQ"'),
 (2,
  '0.099*"Internet" + 0.066*"Silverlight 4.0" + 0.058*"MongoDB" + 0.052*"C#/.NET" + 0.050*".NET Framework 3.5" + 0.029*"ASP.NET MVC" + 0.019*"Scripting languages (HTML, JavaScript, XML etc.)" + 0.017*"SOA" + 0.014*"AngularJS" + 0.010*"CSS3"'),
 (3,
  '0.050*"MS Office" + 0.049*"MS Excel" + 0.042*"MS Word" + 0.038*"MS SharePoint" + 0.035*"MS PowerPoint" + 0.034*"Lotus Notes" + 0.034*"MS Outlook" + 0.032*"SpiraTest" + 0.029*"MS Project" + 0.026*"Adobe Photoshop"'),
 (4,
  '0.018*"Hibernate" + 0.017*"Git" + 0.016*"Maven" + 0.016*"Spring" + 0.015*"JUnit" + 0.0

In [18]:
print('ready')
# Cache file for the HDP model.
# NOTE(review): the "128" in the name does not match DIM = 256, and the cache is
# not keyed on DIM; delete the file to force a retrain after changing the inputs.
file3 = "know_how_128_hdp.lda"

if os.path.exists(file3):
    # Reuse the previously trained model instead of repeating the long training run.
    hdp = models.HdpModel.load(file3)
    print("Hdp model loaded")
else:
    print ("looong calculation begins")
    # Fixed random_state so that retraining from a fresh kernel is repeatable.
    hdp = models.HdpModel(bow_corpus, id2word=dictionary, random_state=42)
    hdp.save(file3)
    print("Hdp model saved")

ready
Hdp model loaded


In [19]:
# Display topics of the HDP model.
# NOTE(review): the positional 6 presumably limits the number of topics shown,
# mirroring num_topics=6 used for the LDA models; confirm against the
# print_topics signature of the gensim version in use.
hdp.print_topics(6)

[(0,
  '0.028*XSD + 0.022*Windows Server + 0.018*Silverlight + 0.018*XMLSpy + 0.016*Swing + 0.016*Sparx Enterprise Architect + 0.015*Linux + 0.014*SQL + 0.013*Enterprise Architect + 0.013*jQuery'),
 (1,
  '0.021*POP3 + 0.018*RESTful Web Services + 0.017*Oracle 11g + 0.014*Visual Basic for Applications (VBA) + 0.014*Windows 2000 + 0.014*JDBC + 0.014*Groovy + 0.013*Visual Studio + 0.013*MVVM pattern + 0.012*GWT'),
 (2,
  '0.022*LaTeX + 0.017*iOS + 0.015*HP Quality Center + 0.014*CSS + 0.014*Struts + 0.014*JBoss Application Server + 0.014*HTML5 + 0.013*Crystal Reports + 0.013*CVS + 0.012*SSH'),
 (3,
  '0.021*Entity Framework + 0.019*XSLT + 0.017*Silverlight 4.0 + 0.017*HTTP + 0.016*MS Windows + 0.016*RESTful Web Services + 0.015*Wiki + 0.014*Java EE + 0.013*JBoss Application Server + 0.012*MS Office '),
 (4,
  '0.017*MS SQL Server 2000 + 0.016*Qt + 0.015*AJAX + 0.015*PHP Version 4 + 0.013*Assembler + 0.013*MongoDB + 0.012*Eclipse + 0.011*C# + 0.011*Struts + 0.011*JBoss Application Server'

In [20]:
print('ready')
# Cache file for the Word2Vec model trained on the per-person know-how lists.
# NOTE(review): the "128" in the name does not match DIM = 256, and the cache is
# not keyed on DIM; delete the file to force a retrain after changing the inputs.
file4 = "know_how_128.word2vec"

if os.path.exists(file4):
    # Reuse the previously trained model instead of repeating the long training run.
    wv = models.Word2Vec.load(file4)
    print("word2vec model loaded")
else:
    print ("looong calculation begins")
    try:
        # gensim >= 4.0 calls the number of training passes `epochs`.
        wv = models.Word2Vec(sentences['KnowHow'], epochs=2000)
    except TypeError:
        # Older gensim releases only accept the same setting as `iter`.
        wv = models.Word2Vec(sentences['KnowHow'], iter=2000)
    wv.save(file4)
    print("word2vec model saved")

ready
word2vec model loaded


In [21]:
# Terms closest to "Java minus C#" in the embedding space.
# Similarity queries live on the model's KeyedVectors (`.wv`); the model-level
# most_similar() was removed in gensim 4.0. getattr falls back to the model
# itself for very old gensim releases that have no `.wv` attribute.
getattr(wv, 'wv', wv).most_similar(positive=['Java'], negative=['C#'], topn=4)

[('XML', 0.5272793173789978),
 ('Perl', 0.43977513909339905),
 ('Confluence', 0.419683039188385),
 ('MS Project', 0.39706218242645264)]

In [22]:
# Terms closest to "C# minus Java" in the embedding space.
# Similarity queries live on the model's KeyedVectors (`.wv`); the model-level
# most_similar() was removed in gensim 4.0. getattr falls back to the model
# itself for very old gensim releases that have no `.wv` attribute.
getattr(wv, 'wv', wv).most_similar(positive=['C#'], negative=['Java'], topn=4)

[('C++', 0.4358682632446289),
 ('Windows 2000', 0.40301600098609924),
 ('Eclipse', 0.4011368155479431),
 ('WCF', 0.3830938935279846)]