In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('FMCG_20companies.csv')
df.head()

Unnamed: 0,Quarters,Company,Total Income From Operations,Return on Equity Ratio,Price-Equity Ratio,Gross Margin,Profit Margin,EPS,Time Interest Earned Ratio,EV/Net Operating Revenue,Change in Inventory,CPI,Inflation,Closing Stock Price
0,Mar '20,Procter and Gamble,656.05,20.21,146.39,0.64,0.14,28.07,52.39,52.15,-16.18,148.6,0.058,4109.05
1,Dec '19,Procter and Gamble,859.27,26.47,103.92,0.65,0.16,41.88,82.25,39.81,21.2,150.4,0.074,4352.3
2,Sep '19,Procter and Gamble,852.14,26.25,111.13,0.59,0.16,42.16,504.83,40.15,-8.62,145.8,0.04,4685.15
3,Jun '19,Procter and Gamble,637.29,19.63,219.22,0.55,0.1,18.73,15.27,53.68,-25.4,142.9,0.032,4106.05
4,Mar '19,Procter and Gamble,699.34,21.54,132.34,0.61,0.13,27.76,258.37,45.39,25.8,140.4,0.029,3673.65


In [3]:
df.shape

(480, 14)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Quarters                    400 non-null    object 
 1   Company                     400 non-null    object 
 2   Return on Equity Ratio      400 non-null    float64
 3   Price-Equity Ratio          400 non-null    float64
 4   Gross Margin                400 non-null    float64
 5   Profit Margin               400 non-null    float64
 6   EPS                         400 non-null    float64
 7   Time Interest Earned Ratio  359 non-null    float64
 8   EV/Net Operating Revenue    400 non-null    float64
 9   Change in Inventory         400 non-null    float64
 10  CPI                         400 non-null    float64
 11  Inflation                   400 non-null    float64
 12  Closing Stock Price         400 non-null    float64
 13  Month                       400 non

## Train Test Split

In [7]:
df_test = df[df["Quarters"].isin(["Mar '20","Dec '19","Sep '19","Jun '19"])]
df_train = df[~df['Quarters'].isin(df_test['Quarters'])]

df_train.shape, df_test.shape

((400, 16), (80, 16))

In [8]:
y_train = df_train["Closing Stock Price"]
X_train = df_train.drop("Closing Stock Price",axis=1)
X_train.shape, y_train.shape

((400, 15), (400,))

## Preprocessing Function

In [9]:
def preprocessor(df):
  df['Month'],df['Year'] = df['Quarters'].str.split().str
  df['Year'] = df['Year'].str.replace("'","").astype(int)
  print("Quarter column split in Month and Year")
  df.drop("Quarters",axis=1,inplace=True)
  print("Quarter column dropped from dataframe")

## Split Categorical and Numerical variables

In [None]:
all_cols = list(X_train.columns)
cat_cols = list(X_train.select_dtypes("object").columns)
num_cols = list(set(all_cols)- set(cat_cols))
num_cols

['Price-Equity Ratio',
 'Return on Equity Ratio',
 'CPI',
 'Time Interest Earned Ratio',
 'Closing Stock Price',
 'Change in Inventory',
 'Inflation',
 'Gross Margin',
 'EPS',
 'Profit Margin',
 'EV/Net Operating Revenue']

## One Hot Encoding for Categorical Variables

In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe_encoder = OneHotEncoder(handle_unknown='ignore')
ohe_encoder.fit(X_train[cat_cols])
ohe_encoder.categories_

[array(['Britannia', 'CCL Products India Ltd.', 'Colgate Palmolive',
        'Dabur India', 'Emami', 'Gillette India', 'Globus Spirits Ltd.',
        'Godrej Consumer', 'HUL', 'Hatsun Agro', 'Heritage Foods', 'ITC',
        'Jubiliant Foodworks', 'Marico', 'Nestle', 'Procter and Gamble',
        'Tasty Bite Eatables', 'Tata Consumer Products',
        'United Breweries Ltd.', 'United Spirits Ltd'], dtype=object),
 array(['Dec', 'Jun', 'Mar', 'Sep'], dtype=object)]

In [None]:
encoded_cat_names = list(ohe_encoder.get_feature_names(cat_cols))
encoded_cat_names

['Company_Britannia',
 'Company_CCL Products India Ltd.',
 'Company_Colgate Palmolive',
 'Company_Dabur India',
 'Company_Emami',
 'Company_Gillette India',
 'Company_Globus Spirits Ltd.',
 'Company_Godrej Consumer',
 'Company_HUL',
 'Company_Hatsun Agro',
 'Company_Heritage Foods',
 'Company_ITC',
 'Company_Jubiliant Foodworks',
 'Company_Marico',
 'Company_Nestle',
 'Company_Procter and Gamble',
 'Company_Tasty Bite Eatables',
 'Company_Tata Consumer Products',
 'Company_United Breweries Ltd.',
 'Company_United Spirits Ltd',
 'Month_Dec',
 'Month_Jun',
 'Month_Mar',
 'Month_Sep']

## Standard Scaling for Numerical Variables

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train[num_cols])

StandardScaler()

In [None]:
sc.feature_names_in_

array(['Price-Equity Ratio', 'Return on Equity Ratio', 'CPI',
       'Time Interest Earned Ratio', 'Closing Stock Price',
       'Change in Inventory', 'Inflation', 'Gross Margin', 'EPS',
       'Profit Margin', 'EV/Net Operating Revenue'], dtype=object)

## Creating Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(transformers=[('',preprocessor,all_cols)
    ('cat', ohe_encoder, cat_cols),
                                               ('sc', sc, num_cols)],
                                 remainder='passthrough')

## Linear Models

In [None]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()

## Creating Pipeline

In [None]:
from sklearn.pipeline import Pipeline
lreg_v1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', linear_reg)],
                   verbose=True)

In [None]:
lreg_v1.fit(X_train, y_train)

ValueError: ignored