In [23]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import tqdm
import math
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [47]:
poly = PolynomialFeatures(2)
scaler = StandardScaler()

In [25]:
def form_data():
    filename = 'all_standups_jun17_jun18.csv'
    user_id = np.zeros((15005, 1))
    interest_id = np.zeros((15005, 1))
    project_id = np.zeros((15005, 1))
    time = np.zeros((15005, 1))
    index = 0;
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            user_id[index, :] = row['user_id']
            interest_id[index, :] = row['interest_id']
            project_id[index, :] = row['project_id']
            time[index, :] = row['time']
            index += 1
    
    n = time.shape[0]
    data = np.zeros((n, 3))    
    data[:, 0] = user_id[:,0]
    data[:, 1] = interest_id[:,0]
    data[:, 2] = project_id[:,0]
    
    data = poly_features(data)
    data = normalization(data)
    print(data.shape)
    data = np.append(data, time, axis=1)
    np.random.shuffle(data)
    print(data.shape)
    return data

In [26]:
def normalization(data):
    scaler.fit(data)
    trans_data = scaler.transform(data)
    return trans_data

In [27]:
def poly_features(data):
    dataTransform = poly.fit_transform(data)
    return dataTransform 

In [28]:
def form_test_train(data, n):
    test_size = math.floor(n * 0.1)
    test = np.zeros((test_size, data.shape[1]))
    train = np.zeros((n - test_size, data.shape[1]))
    test = data[0:test_size, :]
    train = data[test_size:, :]
    return test, train

In [29]:
def linear_regression_model(train, test):
    model = LinearRegression(normalize = True)
    label_length = train.shape[1] - 1
    x = train[:, 0:label_length]
    y = train[:, label_length]  
    model.fit(x, y)
    #for i in range(test.shape[0]):
        #print(test[i, label_length].reshape(-1, 1) - model.predict(test[i, 0:label_length]).reshape(-1, 1))
    error = np.sum(mean_squared_error(test[:, label_length], model.predict(test[:, 0:label_length]))) / test.shape[0]
    return error, model

In [30]:
def reidge_regression_model(train, test):
    alpha = np.arange(0.05, 1.0, 0.05)
    label_length = train.shape[1] - 1
    x = train[:, 0:label_length]
    y = train[:, label_length]  
    errors = np.zeros(alpha.shape)
    index = 0
    lowest_error = 10000
    best_model = None
    for al in alpha:
        model = Ridge(al, max_iter = 50000)
        model.fit(x, y)
        errors[index] = np.sum(mean_squared_error(test[:, label_length], model.predict(test[:, 0:label_length]))) / test.shape[0]
        if(errors[index] < lowest_error):
            best_model = model
            lowest_error = errors[index]        
        index += 1
    return lowest_error, best_model

In [31]:
def plot_data(data, title):
    ax = plt.subplot(111, projection='3d')
    x, y, z = data[:,0], data[:,1], data[:,2]    
    ax.scatter(x, y, z, c='r')  
    ax.text2D(0.05, 0.95, title, transform=ax.transAxes)
    ax.set_zlabel('Z')  
    ax.set_ylabel('Y')
    ax.set_xlabel('X')
    plt.show()

In [32]:
def single_test(data, model):
    print('test normalized data: ' ,data.shape)
    return model.predict(data)

In [33]:
def main():
    data = form_data()    
    test, train = form_test_train(data, data.shape[0])
    linear_error, linear_model = linear_regression_model(train, test)    
    print('Linear Average Error: ', linear_error)
    redige_errors, reidge_model = reidge_regression_model(train, test)
    print('Redige Average Error: ', redige_errors)
    
    x = poly.transform(np.array([6, 23, 74]).reshape(1, -1).astype(float))
    x = scaler.transform(x)
    #print(x)
    #print(test[0,:])
    print(single_test(x, linear_model))
    

In [48]:
if __name__ == "__main__":
    main()

(15005, 10)
(15005, 11)
Linear Average Error:  12.867950040599188
Redige Average Error:  12.867631118231708
test normalized data:  (1, 10)
[140.14049857]
