In [1]:
from scipy import stats as ss

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
def derivadas_regressao(theta, X, y):
    return -2 * ((y - X @ theta) * X.T).mean(axis=1)

In [3]:
def gd(theta, d_fun, X, y, lambda_=0.005, tol=0.000001, max_iter=1000000):
    theta = theta.copy()
    #print('Iter {}; theta = '.format(0), theta)
    old_err_sq = np.inf
    i = 0
    while True:
        # Computar as derivadas
        grad = d_fun(theta, X, y)
        # Atualizar
        theta_novo = theta - lambda_ * grad
        
        # Parar quando o erro convergir
        err_sq = ((X.dot(theta) - y) ** 2).mean()
        if np.abs(old_err_sq - err_sq) <= tol:
            break
        theta = theta_novo
        old_err_sq = err_sq
        #print('Iter {}; theta = '.format(i+1), theta)
        i += 1
        if i == max_iter:
            break
    return theta

In [4]:
df = pd.read_csv('./train.csv', index_col=0)

X = df[['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']].dropna(how='all').fillna(df.mean())


In [5]:
X = X - X.mean()
X = X / X.std(ddof=1)

y = X['NU_NOTA_MT']
X = X[['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']]

X['intercepto'] = 1

X = X[['intercepto', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']]

X = X.values
y = y.values

theta = np.ones(10)
theta = gd(theta, derivadas_regressao, X, y)
theta



array([ 0.00078349,  0.36836257,  0.15866564,  0.16872674,  0.01866017,
        0.01979052,  0.01679227,  0.04017013,  0.05854897, -0.09403852])

In [6]:
def sst(y):
    # YOUR CODE HERE
    return ((y - y.mean()) ** 2).sum()

def predict(X, theta):
    # YOUR CODE HERE
    
    Y = X @ theta
    
    return Y

def sse(X, y, theta):
    # YOUR CODE HERE
        
    return ((y - predict(X, theta))**2).sum()

def r2(X, y, theta):
    # YOUR CODE HERE
    
    return 1.0 - (sse(X, y, theta) / sst(y))

In [7]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=False)
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [8]:
df = pd.read_csv('./test.csv')
X = df[['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']].fillna(df.mean())
y = df['NU_INSCRICAO']

treta = df['TP_PRESENCA_LC']

X['intercepto'] = 1
X = X[['intercepto', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']]

treta.head()
#TP_PRESENCA_LC

0    1
1    1
2    1
3    0
4    1
Name: TP_PRESENCA_LC, dtype: int64

In [11]:

data = {'TP_PRESENCA_LC': treta,
        'NU_INSCRICAO':  y,
        'NU_NOTA_MT': X@theta}

answer = pd.DataFrame (data)

answer.loc[answer['TP_PRESENCA_LC'] == 0, 'NU_NOTA_MT'] = 0.0

answer.loc[answer['TP_PRESENCA_LC'] == 2, 'NU_NOTA_MT'] = 0.0

answer = answer[['NU_INSCRICAO', 'NU_NOTA_MT']]

answer.to_csv('answer.csv', index=False)


answer.head()

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,286.468763
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,277.212463
2,b38a03232f43b11c9d0788abaf060f7366053b6d,402.601191
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,0.0
4,715494628a50142ce8cb17191cfe6d0f3cae0934,370.550186
