# MICROSOFT

### Problem statement:
We want to explore microfot stock since inception and make a determination on the stock price.
This is a regression problem as we are trying to predict the price of the stock.


#### We will cover topics such as:
1. Data ingestion
1. Data exploration
3. Data clasfication analysis
4. Splitting into testing and training data
5. Data modeling 
6. Model training
7. Model application
8. Model evaluation
9. Plot of graph




### import all libraries

In [2]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv("Microsoft Dataset.csv")

In [8]:
data.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1986-03-13,0.088542,0.101563,0.088542,0.097222,0.060055,1031788800
1,1986-03-14,0.097222,0.102431,0.097222,0.100694,0.062199,308160000
2,1986-03-17,0.100694,0.103299,0.100694,0.102431,0.063272,133171200
3,1986-03-18,0.102431,0.103299,0.098958,0.099826,0.061663,67766400
4,1986-03-19,0.099826,0.100694,0.097222,0.09809,0.060591,47894400
5,1986-03-20,0.09809,0.09809,0.094618,0.095486,0.058982,58435200
6,1986-03-21,0.095486,0.097222,0.091146,0.092882,0.057374,59990400
7,1986-03-24,0.092882,0.092882,0.08941,0.090278,0.055765,65289600
8,1986-03-25,0.090278,0.092014,0.08941,0.092014,0.056838,32083200
9,1986-03-26,0.092014,0.095486,0.091146,0.094618,0.058446,22752000


In [10]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0
mean,56.808047,57.396715,56.203807,56.820997,51.194432,56932540.0
std,86.835233,87.652713,85.966316,86.849717,86.831026,38140730.0
min,0.088542,0.092014,0.088542,0.090278,0.055765,2304000.0
25%,5.669922,5.732422,5.578125,5.65625,3.493896,32265750.0
50%,27.280001,27.540001,27.040001,27.298437,19.14719,50058700.0
75%,46.37625,46.8925,45.845312,46.360001,37.789997,70908600.0
max,432.970001,433.600006,427.160004,430.519989,430.519989,1031789000.0


In [13]:
data.shape

(9632, 7)

In [24]:
# to check if there are any null values in the dataset
total = data.isnull().sum().sort_values(ascending=True)
percent = (total/len(data))*100
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) .transpose()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
Total,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0


there is no null values in the dataset

In [25]:
#CHICKING FOR UNIQUE VALUES --- everything is uniques
data.nunique()

Date         9632
Open         5751
High         5685
Low          5707
Close        5899
Adj Close    7530
Volume       9440
dtype: int64

### SPLIT INTO TARGET AND FEATURES

In [65]:
# Since our target is Volume, lets split it ---- we would have used one hot encoding if there are none numerical values in our dataset
y = data["Volume"]
X = pd.get_dummies(data.drop("Volume", axis=1),drop_first=True)

In [66]:
#lets view our x and y --- in shape
y.shape

(9632,)

### Lets split our data into traing and testing data

In [67]:
# import the library from sklearn
from sklearn.model_selection import train_test_split

#split our data

X_train, X_test = train_test_split(X, test_size = 0.20, random_state= 60)
y_train, y_test = train_test_split(y, test_size = 0.20, random_state= 60)

X_train.shape

(7705, 9636)

In [68]:
y_test.shape

(1927,)

### Train our model

In [69]:
#we import the library from sklearn
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
#fit your model into the regression
lr.fit(X_train, y_train)

### Apply our model

In [72]:
# Lets make our prediction to see our performance
y_lr_train_pred = lr.predict(X_train)
y_lr_test_pred = lr.predict(X_test)

In [77]:
y_lr_train_pred

array([40557500.00001526, 38686099.99937642, 34079700.00013268, ...,
       28703999.99996436, 46763199.999964  , 49150399.9999367 ])

In [78]:
y_lr_test_pred

array([ 7.18141107e+07,  3.69141820e+08,  1.92902160e+08, ...,
       -3.64822811e+08, -7.07360410e+08, -1.61656549e+08])

### Model Evaluation

In [112]:
from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_absolute_error

lr_r2_train = r2_score(y_train, y_lr_train_pred)
lr_MSE_train = MSE(y_train, y_lr_train_pred)

lr_r2_test = r2_score(y_test, y_lr_test_pred)
lr_MSE_test = MSE(y_test, y_lr_test_pred)

# lets know the and the mean squared error, r2 score

print(lr_r2_train)
print(lr_MSE_train)
print(lr_r2_test)
print(lr_MSE_test)



1.0
4.713871274717035e-10
-194.63250136716456
2.928481007181496e+17


In [114]:
#make it into a dataframe
df = pd.DataFrame(["LiinearRegression",lr_r2_train,lr_MSE_train, lr_r2_test, lr_MSE_test]).transpose()
df

Unnamed: 0,0,1,2,3,4
0,LiinearRegression,1.0,0.0,-194.632501,2.928481007181496e+17


In [115]:
df.columns = ["MODEL","TRAIN R2", "TRAIN MSE","TEST R2", "TEST MSE"]
df

Unnamed: 0,MODEL,TRAIN R2,TRAIN MSE,TEST R2,TEST MSE
0,LiinearRegression,1.0,0.0,-194.632501,2.928481007181496e+17


## Random Forest