In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the semicolon-delimited data set.
df = pd.read_csv('greenchips.csv', delimiter=';')

# Drop every column that contains at least one NaN (rows are left intact).
# NOTE(review): presumably this targets empty columns created by trailing ';'
# separators -- confirm. A single missing value in PRICE or SALES would drop
# that column as well and make the lookups below fail with a KeyError.
df = df.dropna(axis=1)


def _decimal_comma_to_float(column):
    """Return `column` as floats, reading ',' as the decimal separator.

    Values are cast to str first so that a column pandas has already parsed
    as numeric (no decimal commas in the file) is converted instead of
    raising AttributeError on the `.str` accessor. `regex=False` makes the
    replacement a plain literal substitution on every pandas version.
    """
    return column.astype(str).str.replace(',', '.', regex=False).astype(float)


# Convert 'PRICE' and 'SALES' to floats and write them back to the dataframe
price = _decimal_comma_to_float(df['PRICE'])
sales = _decimal_comma_to_float(df['SALES'])
df['PRICE'] = price
df['SALES'] = sales

# Design matrix: PRICE plus a constant column for the intercept
X = sm.add_constant(price)

# Fit the OLS regression model: sales = a + price * b
model = sm.OLS(sales, X).fit()

# Print the full regression summary (coefficients, std errors, fit statistics)
print(model.summary())

# Correlation matrix for the numeric columns of the dataframe.
# numeric_only=True is passed explicitly (needs pandas >= 1.5): since
# pandas 2.0 the default raises if any remaining column is non-numeric.
print('Correlation matrix:')
print(df.corr(numeric_only=True), end='\n\n')

# R^2 value for the model
r2 = model.rsquared
print('R^2:', r2)


                            OLS Regression Results                            
Dep. Variable:                  SALES   R-squared:                       0.776
Model:                            OLS   Adj. R-squared:                  0.774
Method:                 Least Squares   F-statistic:                     353.7
Date:                Tue, 22 Oct 2024   Prob (F-statistic):           6.28e-35
Time:                        17:17:51   Log-Likelihood:                -510.34
No. Observations:                 104   AIC:                             1025.
Df Residuals:                     102   BIC:                             1030.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        626.6142     26.359     23.772      0.0