In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the stock-returns workbook into a DataFrame; the path is relative to the
# current working directory.
returns_df=pd.read_excel('Stock_returns.xlsx')

In [None]:
# For CSV files, use pd.read_csv instead of pd.read_excel.

In [None]:
returns_df.head(5)

In [None]:
returns_df.info()

In [None]:
returns_df['Apple'].describe()

In [None]:
returns_df.describe()

In [None]:
returns_df.columns

# Accessing Subsets of Data

In [None]:
returns_df.loc[0:3]  # for loc, the second index in the slice is inclusive

In [None]:
returns_df.iloc[0:4] # for iloc, the second index in the slice is exclusive

In [None]:
returns_df['Amazon'].iloc[0:10]

In [None]:
returns_df.iloc[0:10]['Amazon']

In [None]:
returns_df.iloc[0:10].Amazon

In [None]:
returns_df.iloc[4:10,0:3]

In [None]:
# Rows 4-5: columns 0-1 and column 4, stitched side by side.
pd.concat(
    [returns_df.iloc[4:6, :2], returns_df.iloc[4:6, 4:5]],
    axis='columns',
)


In [None]:
# Positional slice (rows 20-29). Use .iloc explicitly, as the other cells do,
# rather than relying on how a bare [] slice is interpreted on a Series.
returns_df['Msft'].iloc[20:30]

In [None]:
# Number of entries in the 'Apple' column (one per row of the frame).
print('Number of rows', returns_df['Apple'].size)

In [None]:
# Average of the 'Apple' column, rounded to three decimals for display.
print('Mean of Apple', np.round(returns_df['Apple'].mean(), 3))

In [None]:
returns_df.sample(10)

In [None]:
returns_df.dtypes

# A Different Data Set: 2023_Roxbury


In [None]:
# Load '2023_Roxbury.xlsx' into a DataFrame; the path is relative to the
# current working directory.
house_df=pd.read_excel('2023_Roxbury.xlsx')

# This data is a slightly cleaned and organized version of the Boston house prices data, which can be found here: https://data.boston.gov/dataset/property-assessment

In [None]:
house_df.head()

In [None]:
house_df.columns

In [None]:
house_df.describe()

### Missing Values

In [None]:
# Keep only rows with no missing value in any column; house_df is not modified.
reduced_df = house_df.dropna(axis=0, how='any')

In [None]:
print('Number of rows after removing rows with missing values: ',
     len(reduced_df))

In [None]:
# Fill missing BED_RMS entries with the column's median (computed before filling).
medianBedrooms = house_df['BED_RMS'].median()
house_df['BED_RMS'] = house_df['BED_RMS'].fillna(medianBedrooms)

In [None]:
print('Number of rows after filling the missing values with median is: ',
     len(house_df['BED_RMS']))

### Normalizing the Data

In [None]:
house_df.describe()

In [None]:
house_df_v2=house_df.dropna()

In [None]:
# Rebuild house_df_v2 from house_df rather than dropping columns in place, so
# this cell can be re-run safely. With inplace=True a second run raised
# KeyError because the columns had already been removed.
house_df_v2 = house_df.dropna().drop(columns=['LU', 'HEAT_TYPE', 'PROP_VIEW'])


In [None]:
# Z-score each column: subtract the column mean, then divide by the column
# standard deviation (pandas' default sample standard deviation).
# NOTE(review): assumes every remaining column is numeric — confirm.
normalized_house_df = house_df_v2.sub(house_df_v2.mean()).div(house_df_v2.std())

In [None]:
normalized_house_df.describe()

# Regression

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of 'Apple' against 'Spx' with a fitted regression line.
# Label the Axes that regplot returns instead of going through pyplot's
# implicit "current axes"; the trailing semicolon keeps the notebook from
# echoing the return value of the last call under the figure.
ax = sns.regplot(x='Spx', y='Apple', data=returns_df)
ax.set(
    xlabel='SP500 Returns',
    ylabel='Apple Returns',
    title='Relationship between SP500 and Apple',
);

# (OPTIONAL) Machine Learning Approach

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [5]:
# Target is 'Apple'; predictors are all remaining columns that are not in the
# exclusion list, kept in their original column order.
outcome = 'Apple'
excludeColumns = ('Amazon', 'Date', 'Msft', 'Ibm', 'Apple')
predictors = returns_df.columns[~returns_df.columns.isin(excludeColumns)].tolist()

X = returns_df[predictors]
y = returns_df[outcome]

In [6]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)

In [7]:
# Fit a linear regression on the training rows, then predict those same rows
# (in-sample fitted values).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_train)

In [8]:
# Actual vs. fitted values on the training rows, plus residual = actual - predicted.
train_results = pd.DataFrame(
    {'Apple': y_train, 'predicted': y_pred}
).assign(residual=lambda frame: frame['Apple'] - frame['predicted'])
print(train_results.head())

         Apple  predicted  residual
1884  0.019234   0.005594  0.013639
360  -0.007588  -0.005349 -0.002239
1814  0.007712   0.004771  0.002941
1649 -0.016099  -0.033253  0.017153
412   0.001231   0.003065 -0.001834


In [9]:
y_train.head()

1884    0.019234
360    -0.007588
1814    0.007712
1649   -0.016099
412     0.001231
Name: Apple, dtype: float64

In [10]:
X_train.head()

Unnamed: 0,Spx
1884,0.004221
360,-0.004873
1814,0.003537
1649,-0.028059
412,0.002119


In [11]:
print('Intercept:',model.intercept_)
print('Slope:',model.coef_)

Intercept: 0.000515011427237597
Slope: [1.20344529]


In [12]:
# Predict the held-out rows and tabulate actual, predicted and residual values.
valid_pred = model.predict(X_test)
valid_results = pd.DataFrame({'Apple': y_test, 'predicted': valid_pred})
valid_results['residual'] = valid_results['Apple'] - valid_results['predicted']
print(valid_results.head())

         Apple  predicted  residual
994  -0.009779   0.001295 -0.011074
2406 -0.009913  -0.008172 -0.001741
321   0.026016   0.018891  0.007124
1612  0.072022   0.059961  0.012061
212   0.004923   0.000441  0.004482


In [13]:
from sklearn.metrics import r2_score

# R^2 of the predictions on the training rows and on the held-out rows.
train_r2 = r2_score(train_results['Apple'], train_results['predicted'])
valid_r2 = r2_score(valid_results['Apple'], valid_results['predicted'])
print('Training set r2: ', train_r2)
print('Validation set r2: ', valid_r2)

Training set r2:  0.5608262941008804
Validation set r2:  0.5212017769522648


# Mean Squared Error

In [15]:
# Mean squared error of the model's predictions on the held-out test rows.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=valid_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  0.00014305699435155047


In [16]:
# This code was prepared by Orhan Erdem