In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
# import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Configure visualisations
# %matplotlib inline
# color = sns.color_palette()
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
# mpl.style.use( 'ggplot' )
# sns.set_style( 'whitegrid' )
pylab.rcParams[ 'figure.figsize' ] = 8,6
  
data = pd.read_csv('/input/finaldata.csv')

newDf = data.select_dtypes(include=[np.number])
cols  = newDf.columns
zero_rows = []
zero_per = []
for i in cols:
    zero_rows.append(newDf[newDf[i] == 0].shape[0])
    zero_per.append(newDf[newDf[i] == 0].shape[0] * 100 / float(data.shape[0]))

df = pd.DataFrame({'col':cols,'numberRows':zero_rows,'zero_per':zero_per})

"""# changing the time to their hours representation"""

time_columns = ['createdAt','updatedAt','pushedAt']
for i in time_columns:
    data[i] =  data[i].apply(lambda x : x.replace('T',' ').replace('Z',''))


for i in time_columns:
    data[i] = data[i].apply(lambda x: int(datetime.strptime(x,'%Y-%m-%d %H:%M:%S').strftime('%s')) / (60 * 60))

"""# website url to binary form 1(yes) / 0(no)"""

data['websiteUrl'] = data['websiteUrl'].fillna('')

data['websiteUrl'] = data['websiteUrl'].apply(lambda x : 1 if len(x) > 0 else 0)

data['description'] = data['description'].fillna('')

"""# **description Word and character count**"""

data['desWordCount'] = data['description'].apply(lambda x: len(x.split(' ')))
data['desCharCount'] = data['description'].apply(len)

"""# hasWikiEnabled to binary form"""

data['hasWikiEnabled'] = data['hasWikiEnabled'].apply(lambda x : 1 if x else 0)

"""# license to one hot encoding"""

data['license'] = data['license'].fillna('')

license_cols = ['mit_license','nan_license','apache_license','other_license','remain_license']

for i in license_cols:
    
    if i.startswith('mit'):
        data[i] = data['license'].apply(lambda x: 1 if x == 'MIT License' else 0)
    elif i.startswith('nan'):
        data[i] = data['license'].apply(lambda x: int(len(x) == 0))
    elif i.startswith('apache'):
        data[i] = data['license'].apply(lambda x: 1 if x == 'Apache License 2.0' else 0)
    elif i.startswith('other'):
        data[i] = data['license'].apply(lambda x: 1 if x == 'Other' else 0)

data['remain_license'] = (data[license_cols[:-1]].sum(axis=1) == 0).astype(int)

"""# primary language to one hot"""


lang_cols = np.array(data.primaryLanguage.value_counts()[:6].index)

data[lang_cols[0]] = data.primaryLanguage.apply(lambda x : int(x == 'JavaScript'))                  
data[lang_cols[1]] = data.primaryLanguage.apply(lambda x : int(x == 'Python'))                  
data[lang_cols[2]] = data.primaryLanguage.apply(lambda x : int(x == 'Java'))                  
data[lang_cols[3]] = data.primaryLanguage.apply(lambda x : int(x == 'Objective-C'))                  
data[lang_cols[4]] = data.primaryLanguage.apply(lambda x : int(x == 'Ruby'))                  
data[lang_cols[5]] = data.primaryLanguage.apply(lambda x : int(x == 'PHP'))
data['other_language'] = (data[lang_cols].sum(axis=1) == 0).astype(int)

"""# type to binary 1(user) / 0(org)"""

data['type'] = data.type.apply(lambda x : 1 if x == 'user' else 0)

"""# setting nan members to 0"""

data['members'] = data['members'].fillna(0)

"""# changing NAN to 0 for the following"""

data['organizations'] = data['organizations'].fillna(0)
data['gists'] = data['gists'].fillna(0)
data['gistStar'] = data['gistStar'].fillna(0)
data['gistComments'] = data['gistComments'].fillna(0)
data['following'] = data['following'].fillna(0)
data['followers'] = data['followers'].fillna(0)

"""# removing the null numCommits(of master branch) rows"""


data = data[data.commits.notnull()]

"""# removing the null repo name rows"""

data = data[data.reponame.notnull()]

## Columns to drop
df = data
col = ['description' , 'isArchived' , 'license' ,'location' , 'login' , 'primaryLanguage' , 'reponame' , 'siteAdmin']
data = data.drop(col , axis = 1)

X = data.drop(['stars'] , axis =1)
Y = data.stars

# s = StandardScaler()
# X = s.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

"""# random forest"""

# from sklearn.ensemble import RandomForestRegressor
# model = RandomForestRegressor(n_jobs=-1)
# # Try different numbers of n_estimators - this will take a minute or so
# estimators = np.arange(10, 200, 10)
# scores = []
# for n in estimators:
#     model.set_params(n_estimators=n)
#     model.fit(X_train, y_train)
#     scores.append(model.score(X_test, y_test))
# plt.title("Effect of n_estimators")
# plt.xlabel("n_estimator")
# plt.ylabel("score")
# plt.plot(estimators, scores)

# from sklearn.ensemble import GradientBoostingRegressor
# reg = GradientBoostingRegressor(verbose = 1)
# reg.fit(X_train , y_train)
# print(reg.score(X_train, y_train))
# print(reg.score(X_test, y_test))



'# random forest'

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor(verbose = 1)
reg.fit(X_train , y_train)
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))

      Iter       Train Loss   Remaining Time 
         1     2360314.1265           27.52s
         2     2074469.1673           26.73s
         3     1836205.1469           26.57s
         4     1638331.9862           26.16s
         5     1467911.6259           25.84s
         6     1329086.0858           25.45s
         7     1208981.9030           25.48s
         8     1113993.2064           25.17s
         9     1029045.8079           24.85s
        10      958567.9627           24.59s
        20      602582.8212           22.13s
        30      498356.7437           19.50s
        40      452840.1770           16.92s
        50      421269.7745           14.27s
        60      397800.7554           11.49s
        70      381910.7280            8.67s
        80      370756.3901            5.76s
        90      358842.4047            2.88s
       100      350172.1649            0.00s
0.870595381319
0.784716842866


In [5]:
from sklearn.linear_model import Lasso
lg = Lasso(alpha = 0.01)
lg.fit(X_train, y_train)
print(lg.score(X_train, y_train))
print(lg.score(X_test, y_test))

0.637669673837
0.603356606944


In [7]:
from sklearn.linear_model import Ridge
lg = Ridge(alpha = 0.001)
lg.fit(X_train, y_train)
print(lg.score(X_train, y_train))
print(lg.score(X_test, y_test))

0.637669729405
0.603297763456


In [8]:
import xgboost

In [9]:
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [10]:
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.08, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [12]:
print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))

0.942744905903
0.820112348943


In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1)
# Try different numbers of n_estimators - this will take a minute or so
model.set_params(n_estimators=100)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.974180918503
0.793829574743


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [7]:
from sklearn.metrics import accuracy_score,mean_squared_error
seed = 1
np.random.seed(seed)

In [9]:
def baseline_model():
    model = Sequential()
    model.add(Dense(100, input_dim=54, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(50, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(25, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=200, batch_size=32, verbose=True)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, X_train.values, y_train.values, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

# estimator.fit(X_train, y_train)
# prediction = estimator.predict(X_test)
# prediction_train = estimator.predict(X_train)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

KeyboardInterrupt: 

In [11]:
from sklearn.metrics import r2_score

In [11]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [12]:
r2_score(y_test, prediction)

0.81582181022225742

In [None]:
r2_score(y_train, prediction_train)

In [13]:
print(mean_squared_error(y_test, prediction) ** 0.5)

672.018502203
