In [3]:
import re
import sys
import pandas as pd
import numpy as np
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split

In [4]:
df = pd.read_csv("repos_cats.csv", encoding="utf-8")

In [5]:
df.dropna(axis=0, subset=["description", "categories2", "language"], inplace=True)

In [6]:
df

Unnamed: 0,description,full_name,language,homepage,fork,categories,repo_url,source_repo,repo,items_count,repo_name,source_name,categories2,categories3
0,Paging view controller with customizable menu ...,kitasuke/PagingMenuController,Swift,,False,"Preloader.Ophiuchus, PagingMenuController",https://github.com/kitasuke/PagingMenuController,vsouza/awesome-ios,kitasuke/PagingMenuController,1447,PagingMenuController,awesome-ios,Preloader.Ophiuchus,"awesome-ios, Preloader.Ophiuchus"
1,Documentation on building a HTTPS stack in AWS...,jvehent/haproxy-aws,Shell,,False,"Miscellaneous Repos, Community Repos:, jvehent...",https://github.com/jvehent/haproxy-aws,donnemartin/awesome-aws,jvehent/haproxy-aws,316,haproxy-aws,awesome-aws,"Miscellaneous Repos, Community Repos:","awesome-aws, Miscellaneous Repos, Community Re..."
2,Custom UIButton effect inspired by Google Mate...,zoonooz/ZFRippleButton,Swift,,False,"SwiftMessages, ZFRippleButton",https://github.com/zoonooz/ZFRippleButton,vsouza/awesome-ios,zoonooz/ZFRippleButton,1447,ZFRippleButton,awesome-ios,SwiftMessages,"awesome-ios, SwiftMessages"
3,A Swift event bus for UIWebView/WKWebView and JS.,coshx/caravel,Swift,http://coshx.github.io/caravel/,False,"EventBus, Caravel",https://github.com/coshx/caravel,vsouza/awesome-ios,coshx/caravel,1447,caravel,awesome-ios,EventBus,"awesome-ios, EventBus"
4,ROAD – Rapid Objective-C Applications Development,epam/road-ios-framework,Objective-C,,False,"Networking, ROADFramework",https://github.com/epam/road-ios-framework,vsouza/awesome-ios,epam/road-ios-framework,1447,road-ios-framework,awesome-ios,Networking,"awesome-ios, Networking"
5,Job scheduling made easy.,carlescere/scheduler,Go,,False,"Utilities, scheduler",https://github.com/carlescere/scheduler,avelino/awesome-go,carlescere/scheduler,776,scheduler,awesome-go,Utilities,"awesome-go, Utilities"
6,iOS & OSX Bluetooth library for RxSwift,Polidea/RxBluetoothKit,Swift,,False,"Reactive Programming, RxBluetoothKit",https://github.com/Polidea/RxBluetoothKit,vsouza/awesome-ios,Polidea/RxBluetoothKit,1447,RxBluetoothKit,awesome-ios,Reactive Programming,"awesome-ios, Reactive Programming"
7,iOS & OSX Bluetooth library for RxSwift,Polidea/RxBluetoothKit,Swift,,False,"Reactive Programming, RxBluetoothKit",https://github.com/Polidea/RxBluetoothKit,vsouza/awesome-ios,Polidea/RxBluetoothKit,1447,RxBluetoothKit,awesome-ios,Reactive Programming,"awesome-ios, Reactive Programming"
8,Extended StyleSheets for React Native,vitalets/react-native-extended-stylesheet,JavaScript,,False,"Styling, react-native-extended-stylesheet ★253",https://github.com/vitalets/react-native-exten...,jondot/awesome-react-native,vitalets/react-native-extended-stylesheet,529,react-native-extended-stylesheet,awesome-react-native,Styling,"awesome-react-native, Styling"
9,the last carousel you'll ever need,kenwheeler/slick,JavaScript,kenwheeler.github.io/slick,False,"Sliders, slick",https://github.com/kenwheeler/slick,sorrycc/awesome-javascript,kenwheeler/slick,451,slick,awesome-javascript,Sliders,"awesome-javascript, Sliders"


In [7]:
X = df["language"] + " " + df["description"]
y = df["categories2"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
])


In [10]:
text_clf = text_clf.fit(X_train, y_train)

In [11]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.068195908245505268

In [12]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=20, random_state=42)),
])
_ = text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.34717916924984499

In [13]:
from sklearn import metrics
print(metrics.classification_report(X_test, predicted))

             precision    recall  f1-score   support

 :arrow_up:       0.00      0.00      0.00         0
:arrow_up:, Async       0.00      0.00      0.00         0
:arrow_up:, Collections       0.00      0.00      0.00         0
:arrow_up:, Combinators       0.00      0.00      0.00         0
:arrow_up:, Coroutines       0.00      0.00      0.00         0
:arrow_up:, Declarative Programming       0.00      0.00      0.00         0
:arrow_up:, Dependent Types       0.00      0.00      0.00         0
:arrow_up:, Folds and Lazy Lists       0.00      0.00      0.00         0
:arrow_up:, Graphs       0.00      0.00      0.00         0
:arrow_up:, LLVM Compiler       0.00      0.00      0.00         0
:arrow_up:, Math       0.00      0.00      0.00         0
:arrow_up:, Performance       0.00      0.00      0.00         0
:arrow_up:, RAC / ReactiveCocoa       0.00      0.00      0.00         0
:arrow_up:, RxSwift       0.00      0.00      0.00         0
:arrow_up:, Sets       0.00      0.0

In [14]:
predicted

array([u'Science and Data Analysis',
       u'Miscellaneous Repos, Community Repos:',
       u'Cryptography, cryptography', ...,
       u'Machine Learning, awesome-machine-learning',
       u'FFI, The Rust FFI Omnibus', u'WSTagsField'], 
      dtype='<U89')