In [1]:
#Import libraries
import pandas as pd
import pickle as pk
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the startup CSV into a DataFrame and preview its first ten rows.
# NOTE(review): the absolute '/content/...' path looks like a Colab upload
# location -- confirm or adjust it when running in another environment.
filepath = '/content/startup.csv'
data = pd.read_csv(filepath_or_buffer=filepath)
print(data.head(n=10))

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94
5  131876.90        99814.71        362861.36    New York  156991.12
6  134615.46       147198.87        127716.82  California  156122.51
7  130298.13       145530.06        323876.68     Florida  155752.60
8  120542.52       148718.95        311613.29    New York  152211.77
9  123334.88       108679.17        304981.62  California  149759.96


In [3]:
# Rename the columns on a copy so the frame held in `data` keeps its
# original headers. The new names are assigned positionally, so the list
# must stay in the same order as the columns of the loaded CSV.
data_copy = data.copy()
data_copy.columns = [
    'Research_And_Development',
    'Administration',
    'Marketing_Spend',
    'State',
    'Profit',
]
print(data_copy.head(n=10))

   Research_And_Development  Administration  Marketing_Spend       State  \
0                 165349.20       136897.80        471784.10    New York   
1                 162597.70       151377.59        443898.53  California   
2                 153441.51       101145.55        407934.54     Florida   
3                 144372.41       118671.85        383199.62    New York   
4                 142107.34        91391.77        366168.42     Florida   
5                 131876.90        99814.71        362861.36    New York   
6                 134615.46       147198.87        127716.82  California   
7                 130298.13       145530.06        323876.68     Florida   
8                 120542.52       148718.95        311613.29    New York   
9                 123334.88       108679.17        304981.62  California   

      Profit  
0  192261.83  
1  191792.06  
2  191050.39  
3  182901.99  
4  166187.94  
5  156991.12  
6  156122.51  
7  155752.60  
8  152211.77  
9  149759.96 

In [4]:
# Preprocessing, part 1: separate target from features and one-hot encode
# the categorical column.
categorical_features = ['State']

# 'Profit' is the regression target; every other column is a feature.
y = data_copy['Profit']
x = data_copy.drop(columns=['Profit'])

# drop='first' removes the first category of each encoded feature, and
# sparse_output=False makes the encoder return a dense array.
one_hot = OneHotEncoder(sparse_output=False, drop='first')
x_encoded = one_hot.fit_transform(x[categorical_features])
encoded_columns = one_hot.get_feature_names_out(categorical_features)

# Wrap the encoded array in a DataFrame (fresh default index) so it can be
# joined with the numeric columns later on.
x_encoded_df = pd.DataFrame(data=x_encoded, columns=encoded_columns)

# Everything that is not categorical is kept as a numeric feature.
x_numeric = x.drop(columns=categorical_features)

In [5]:
# Preprocessing, part 2: build the full feature matrix -- encoded columns
# first, numeric columns after -- and standardize it.
# The numeric frame's index is reset so both pieces align row by row.
feature_parts = [x_encoded_df, x_numeric.reset_index(drop=True)]
x_preprocessed = pd.concat(feature_parts, axis=1)

# Make sure every column label is a plain string.
x_preprocessed.columns = x_preprocessed.columns.astype(str)

# NOTE(review): this scaler is fit on every row of the dataset, so x_scaled
# carries statistics from rows that later end up in the test split.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_preprocessed)
print(x_scaled)

[[-6.85994341e-01  1.39326109e+00  2.01641149e+00  5.60752915e-01
   2.15394309e+00]
 [-6.85994341e-01 -7.17740563e-01  1.95586034e+00  1.08280658e+00
   1.92360040e+00]
 [ 1.45773797e+00 -7.17740563e-01  1.75436374e+00 -7.28257028e-01
   1.62652767e+00]
 [-6.85994341e-01  1.39326109e+00  1.55478369e+00 -9.63646307e-02
   1.42221024e+00]
 [ 1.45773797e+00 -7.17740563e-01  1.50493720e+00 -1.07991935e+00
   1.28152771e+00]
 [-6.85994341e-01  1.39326109e+00  1.27980001e+00 -7.76239071e-01
   1.25421046e+00]
 [-6.85994341e-01 -7.17740563e-01  1.34006641e+00  9.32147208e-01
  -6.88149930e-01]
 [ 1.45773797e+00 -7.17740563e-01  1.24505666e+00  8.71980011e-01
   9.32185978e-01]
 [-6.85994341e-01  1.39326109e+00  1.03036886e+00  9.86952101e-01
   8.30886909e-01]
 [-6.85994341e-01 -7.17740563e-01  1.09181921e+00 -4.56640246e-01
   7.76107440e-01]
 [ 1.45773797e+00 -7.17740563e-01  6.20398248e-01 -3.87599089e-01
   1.49807267e-01]
 [-6.85994341e-01 -7.17740563e-01  5.93085418e-01 -1.06553960e+00

In [6]:
#Train and test the model
# Split the *unscaled* feature matrix first. Splitting data that was already
# standardized with statistics from all rows lets the held-out rows
# influence preprocessing (data leakage).
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_preprocessed, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same
# transformation to both splits. `scaler` is rebound here so the object
# that matches the trained model is the one left in scope.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Ordinary least-squares regression on the standardized training features.
model = LinearRegression()
model.fit(x_train, y_train)


In [7]:
# Predict profit for the held-out rows, then print the predictions followed
# by the actual values for a side-by-side sanity check.
y_pred = model.predict(x_test)
print(y_pred, y_test, sep='\n')

[126362.87908252  84608.45383643  99677.49425155  46357.46068582
 128750.48288497  50912.41741905 109741.350327   100643.24281644
  97599.275746   113097.42524437]
13    134307.35
39     81005.76
30     99937.59
45     64926.08
17    125370.37
48     35673.41
26    105733.54
25    107404.34
32     97427.84
19    122776.86
Name: Profit, dtype: float64


In [8]:
#check if model is good
# Root-mean-squared error on the held-out rows (same units as Profit).
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# explicit form works on every version. The value was previously stored
# under the misleading name `mse`.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

# Coefficient of determination (R^2) on the test set, as a percentage.
model_score = model.score(x_test, y_test) * 100
print(model_score)


9055.95732349781
89.87266414319838


In [9]:
#Saving my model using pickle
# model.pkl keeps holding only the fitted regressor, so existing loaders
# continue to work unchanged.
with open('model.pkl', 'wb') as file:
    pk.dump(model, file)

# The regressor was trained on one-hot-encoded, standardized features, so it
# cannot score raw rows on its own. Persist the fitted encoder, the fitted
# scaler and the feature column order alongside it so the exact same
# preprocessing can be replayed at inference time.
# NOTE: only unpickle these files from a trusted source -- loading a pickle
# can execute arbitrary code.
preprocessing_artifacts = {
    'encoder': one_hot,
    'scaler': scaler,
    'feature_columns': list(x_preprocessed.columns),
}
with open('preprocessing.pkl', 'wb') as file:
    pk.dump(preprocessing_artifacts, file)
