In [1]:
import pandas as pd
import statsmodels.api as sm

In [2]:
mat = pd.read_csv("student-mat.csv")
port = pd.read_csv("student-por.csv")

In [3]:
# Misturando os datasets para obter todos os alunos
data = pd.concat([mat, port]).drop_duplicates().reset_index(drop=True)

In [4]:
# Colunas do dataset
data.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [5]:
# Removendo colunas que não serão utilizadas
data = data.drop(
    ["school", "address", "famsize", 
     "Pstatus", "Medu", "Fedu", 
     "Mjob", "Fjob", "reason", 
     "guardian", "traveltime", "studytime", 
     "failures", "schoolsup", "famsup", 
     "paid", "activities", "nursery", 
     "higher", "internet", "romantic", 
     "freetime", "goout", "health", "absences", 
     "G1", "G2", "G3"
    ], 
    axis=1)

In [6]:
# Trocando valores binários de genero para 0/1
data.sex.replace(['M', 'F'], [0, 1], inplace=True)

In [7]:
# Criando a coluna para o consumo de álcool
data["alcool"] = (data["Dalc"] * 5 + data["Walc"] * 2)/7

In [8]:
# Colunas originais
data.columns

Index(['sex', 'age', 'famrel', 'Dalc', 'Walc', 'alcool'], dtype='object')

In [9]:
# Removendo as colunas Dalc e Walc, pois já foi feito o cálculo, e renomeando as colunas
prettydata = data.drop(["Dalc", "Walc"], axis=1)

In [10]:
prettydata.columns = ["gen", "age", "q_rel", "alcool"]

In [11]:
prettydata

Unnamed: 0,gen,age,q_rel,alcool
0,1,18,4,1.000000
1,1,17,5,1.000000
2,1,15,4,2.285714
3,1,15,3,1.000000
4,1,16,4,1.285714
...,...,...,...,...
1039,1,19,5,1.285714
1040,1,18,4,1.000000
1041,1,18,1,1.000000
1042,0,17,2,3.285714


In [12]:
X = prettydata[["gen", "age", "q_rel"]]
Y = prettydata["alcool"]

In [13]:
model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

model.summary()

0,1,2,3
Dep. Variable:,alcool,R-squared (uncentered):,0.805
Model:,OLS,Adj. R-squared (uncentered):,0.804
Method:,Least Squares,F-statistic:,1428.0
Date:,"Wed, 05 May 2021",Prob (F-statistic):,0.0
Time:,08:25:43,Log-Likelihood:,-1328.5
No. Observations:,1044,AIC:,2663.0
Df Residuals:,1041,BIC:,2678.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
gen,-0.6071,0.054,-11.224,0.000,-0.713,-0.501
age,0.1473,0.007,21.116,0.000,0.134,0.161
q_rel,-0.1026,0.027,-3.732,0.000,-0.157,-0.049

0,1,2,3
Omnibus:,252.165,Durbin-Watson:,1.895
Prob(Omnibus):,0.0,Jarque-Bera (JB):,517.682
Skew:,1.375,Prob(JB):,3.86e-113
Kurtosis:,5.082,Cond. No.,34.9


In [15]:
model.pvalues["q_rel"]

0.00020034216714401721