In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# convert display of scientific compute to float
pd.options.display.float_format = '{:.2f}'.format

# Read in the data
data = pd.read_csv('../notebooks/data/merged_inputed_data.csv')

In [2]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

X, y = data.drop('CNT', axis=1), data['CNT']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_withconstant = sm.add_constant(X_train)

# instantiate model
model = sm.OLS(y_train, X_withconstant)

# fit model
results = model.fit()

# predictions
y_preds = results.predict(X_withconstant)

# r2 score
print(f'R-2 Score: {results.rsquared}')


R-2 Score: 0.920700553806398


In [None]:
print(results.summary())

In [None]:
results_summary = results.summary()
results_as_html = results_summary.tables[1].as_html()
results = pd.read_html(results_as_html, header=0, index_col=0)[0]

In [None]:
# sort results where p<|t| is between 0.05 and 0.1
results[results['P>|t|'].between(0.05, 0.1)].sort_values(by='coef', ascending=False)


In [5]:
y_pred = results.predict(X_test)

fig = sns.regplot(x=y_pred, y=y_test, color='red', marker="^", scatter_kws={"s": 100})
fig.set(title="Linear Regression Model", xlabel="Predicted Consecutive Negative Tests", ylabel="Actual Consecutive Negative Tests")
fig.figure.set_size_inches(10, 5)
plt.show(fig)


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 978 and the array at index 1 has size 327

In [None]:
from scipy import stats

model_resids = results.resid

# Calculate residuals
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Plot histogram of residuals
ax1.hist(model_resids, bins=20)
ax1.set_title('Model Residuals')
ax1.set_xlabel('Residuals')
ax1.set_ylabel('Frequency')

# Plot QQ plot of residuals
stats.probplot(model_resids, dist="norm", plot=ax2)
ax2.set_title('QQ Plot of Residuals')

plt.show()
