# `train_test_split()`

The `train_test_split()` function in scikit-learn's `sklearn.model_selection` module is commonly used to split a dataset into training and testing sets. This function is essential for evaluating machine learning models on unseen data and assessing their generalization performance.

+ Function Signature:


```python
train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
```


+ Parameters:


+ `*arrays:` The input data to be split. This can be one or more arrays or matrices.

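+ `test_size:` The proportion of the dataset (a float between 0.0 and 1.0) or the absolute number of samples (an int) to include in the test split. If not specified, it is set to the complement of `train_size`; if `train_size` is also not specified, it defaults to 0.25.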

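+ `train_size:` The proportion of the dataset (a float between 0.0 and 1.0) or the absolute number of samples (an int) to include in the train split. If not specified, it is set to the complement of `test_size`. Both may be given together, as long as they do not add up to more than the whole dataset.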

+ `random_state:` Controls the random seed for shuffling the data before splitting. This ensures reproducibility of the splits.

+ `shuffle:` Whether to shuffle the data before splitting. Default is True.

+ `stratify:` If not None, the data is split in a stratified fashion using this array as the class labels, so that each split preserves the proportion of samples in each class.

+ Returns:

A list of length `2 * len(arrays)` containing the train and test split of each input array, in order. For two inputs `X` and `y`, this is `X_train, X_test, y_train, y_test`.

# EX1

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Fetch the iris bunch and pull out the feature matrix and the label vector
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each partition
shape_report = {
    "Shape of X_train:": X_train.shape,
    "Shape of X_test:": X_test.shape,
    "Shape of y_train:": y_train.shape,
    "Shape of y_test:": y_test.shape,
}
for label, shape in shape_report.items():
    print(label, shape)


Shape of X_train: (120, 4)
Shape of X_test: (30, 4)
Shape of y_train: (120,)
Shape of y_test: (30,)


# EX2

In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Read the price-history CSV into a DataFrame. The path is relative, so the
# file must sit in the notebook's working directory.
df = pd.read_csv('RELIANCE.csv')

In [4]:
# Preview the first rows to check that the file loaded as expected
df.head()

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,vwap,52W H,52W L,VOLUME,VALUE,No of trades
0,27-Mar-2024,EQ,2896.0,3000.0,2894.0,2883.15,2983.75,2985.7,2972.19,3024.9,2220.3,8163322,24262983466.45,299800
1,26-Mar-2024,EQ,2890.0,2904.8,2878.0,2910.05,2886.7,2883.15,2887.28,3024.9,2220.3,5707953,16480430505.15,174805
2,22-Mar-2024,EQ,2899.95,2920.0,2894.7,2901.95,2906.55,2910.05,2909.59,3024.9,2220.3,9763804,28408677998.2,286746
3,21-Mar-2024,EQ,2905.05,2915.8,2889.35,2887.5,2900.05,2901.95,2903.52,3024.9,2220.3,6503468,18882974161.4,244846
4,20-Mar-2024,EQ,2855.9,2890.0,2848.05,2850.5,2886.25,2887.5,2874.53,3024.9,2220.3,4244403,12200651007.8,208132


In [5]:
# Inspect the column labels; the output shows each one carries a trailing space
df.columns

Index(['Date ', 'series ', 'OPEN ', 'HIGH ', 'LOW ', 'PREV. CLOSE ', 'ltp ',
       'close ', 'vwap ', '52W H ', '52W L ', 'VOLUME ', 'VALUE ',
       'No of trades '],
      dtype='object')

In [6]:
# Remove leading/trailing whitespace from every column label so columns can be
# selected by their plain names (e.g. 'close' rather than 'close ')
df.columns = df.columns.str.strip()


In [7]:
# Confirm that the labels no longer have trailing whitespace
df.columns

Index(['Date', 'series', 'OPEN', 'HIGH', 'LOW', 'PREV. CLOSE', 'ltp', 'close',
       'vwap', '52W H', '52W L', 'VOLUME', 'VALUE', 'No of trades'],
      dtype='object')

In [8]:
# Drop the 'vwap' column, which no later cell uses.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because 'vwap' is already gone.
df = df.drop(columns=['vwap'], errors='ignore')

In [9]:
# Confirm that 'vwap' is gone from the column list
df.columns

Index(['Date', 'series', 'OPEN', 'HIGH', 'LOW', 'PREV. CLOSE', 'ltp', 'close',
       '52W H', '52W L', 'VOLUME', 'VALUE', 'No of trades'],
      dtype='object')

In [10]:
# Preview the first five rows after the column clean-up
df.head(5)

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,52W H,52W L,VOLUME,VALUE,No of trades
0,27-Mar-2024,EQ,2896.0,3000.0,2894.0,2883.15,2983.75,2985.7,3024.9,2220.3,8163322,24262983466.45,299800
1,26-Mar-2024,EQ,2890.0,2904.8,2878.0,2910.05,2886.7,2883.15,3024.9,2220.3,5707953,16480430505.15,174805
2,22-Mar-2024,EQ,2899.95,2920.0,2894.7,2901.95,2906.55,2910.05,3024.9,2220.3,9763804,28408677998.2,286746
3,21-Mar-2024,EQ,2905.05,2915.8,2889.35,2887.5,2900.05,2901.95,3024.9,2220.3,6503468,18882974161.4,244846
4,20-Mar-2024,EQ,2855.9,2890.0,2848.05,2850.5,2886.25,2887.5,3024.9,2220.3,4244403,12200651007.8,208132


In [11]:
# Replace commas with empty strings in every string cell of the DataFrame
df = df.replace(',', '', regex=True)

# Any column that held comma-formatted numbers is still text (object dtype)
# after the replacement, so convert everything except the identifier columns
# to real numeric dtypes. A value that is still not numeric raises here,
# close to its cause, instead of surfacing later inside the model.
non_numeric = [col for col in ('Date', 'series') if col in df.columns]
numeric_cols = df.columns.difference(non_numeric)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# Display the DataFrame after removing commas
print(df)

            Date series     OPEN     HIGH      LOW PREV. CLOSE      ltp  \
0    27-Mar-2024     EQ  2896.00  3000.00  2894.00     2883.15  2983.75   
1    26-Mar-2024     EQ  2890.00  2904.80  2878.00     2910.05  2886.70   
2    22-Mar-2024     EQ  2899.95  2920.00  2894.70     2901.95  2906.55   
3    21-Mar-2024     EQ  2905.05  2915.80  2889.35     2887.50  2900.05   
4    20-Mar-2024     EQ  2855.90  2890.00  2848.05     2850.50  2886.25   
..           ...    ...      ...      ...      ...         ...      ...   
120  04-Oct-2023     EQ  2309.00  2319.00  2295.10     2318.15  2314.10   
121  03-Oct-2023     EQ  2329.95  2335.60  2316.00     2345.00  2318.40   
122  29-Sep-2023     EQ  2341.80  2369.10  2334.10     2334.10  2342.05   
123  28-Sep-2023     EQ  2383.00  2383.00  2325.00     2368.90  2339.00   
124  27-Sep-2023     EQ  2343.50  2371.80  2338.50     2342.50  2371.00   

       close    52W H    52W L   VOLUME           VALUE No of trades  
0    2985.70  3024.90  2220.

In [12]:
# Preview the first rows after the comma removal
df.head()

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,52W H,52W L,VOLUME,VALUE,No of trades
0,27-Mar-2024,EQ,2896.0,3000.0,2894.0,2883.15,2983.75,2985.7,3024.9,2220.3,8163322,24262983466.45,299800
1,26-Mar-2024,EQ,2890.0,2904.8,2878.0,2910.05,2886.7,2883.15,3024.9,2220.3,5707953,16480430505.15,174805
2,22-Mar-2024,EQ,2899.95,2920.0,2894.7,2901.95,2906.55,2910.05,3024.9,2220.3,9763804,28408677998.2,286746
3,21-Mar-2024,EQ,2905.05,2915.8,2889.35,2887.5,2900.05,2901.95,3024.9,2220.3,6503468,18882974161.4,244846
4,20-Mar-2024,EQ,2855.9,2890.0,2848.05,2850.5,2886.25,2887.5,3024.9,2220.3,4244403,12200651007.8,208132


In [13]:
# Re-check the column labels before picking the feature and target columns
df.columns

Index(['Date', 'series', 'OPEN', 'HIGH', 'LOW', 'PREV. CLOSE', 'ltp', 'close',
       '52W H', '52W L', 'VOLUME', 'VALUE', 'No of trades'],
      dtype='object')

In [14]:
# Use the raw date strings as the only input and the closing price as the
# target; this makes it easy to see which rows land in each partition.
X = df['Date']
y = df['close']

# 80/20 split with a fixed seed so the same rows are chosen on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [15]:
# Show the training dates; the out-of-order index in the output reflects the
# shuffling that train_test_split applies by default
print(X_train)

67     20-Dec-2023
12     07-Mar-2024
24     21-Feb-2024
45     20-Jan-2024
108    20-Oct-2023
          ...     
106    25-Oct-2023
14     05-Mar-2024
92     13-Nov-2023
51     12-Jan-2024
102    31-Oct-2023
Name: Date, Length: 100, dtype: object


In [16]:
# Show the held-out dates that were not placed in the training set
print(X_test)

18     29-Feb-2024
42     25-Jan-2024
36     05-Feb-2024
76     07-Dec-2023
53     10-Jan-2024
91     15-Nov-2023
56     05-Jan-2024
123    28-Sep-2023
93     12-Nov-2023
4      20-Mar-2024
27     16-Feb-2024
116    10-Oct-2023
44     23-Jan-2024
96     08-Nov-2023
19     28-Feb-2024
77     06-Dec-2023
10     12-Mar-2024
101    01-Nov-2023
11     11-Mar-2024
114    12-Oct-2023
47     18-Jan-2024
78     05-Dec-2023
0      27-Mar-2024
26     19-Feb-2024
31     12-Feb-2024
Name: Date, dtype: object


In [17]:
# Parse the textual dates once and reuse the parsed values for every feature
parsed_dates = pd.to_datetime(df['Date'])

# Append day / month / year columns and remove the original 'Date' column.
# Note: the feature list chosen in the next cell does not include these columns.
df = df.assign(
    Day=parsed_dates.dt.day,
    Month=parsed_dates.dt.month,
    Year=parsed_dates.dt.year,
).drop(columns='Date')

In [18]:
# Predictor columns for the closing price.
# NOTE(review): 'ltp', 'HIGH' and 'LOW' look like same-day values that may sit
# very close to 'close'; confirm this is intended and not target leakage.
feature_columns = [
    'OPEN', 'HIGH', 'LOW', 'PREV. CLOSE', 'ltp',
    '52W H', '52W L', 'VOLUME', 'VALUE',
]
X = df[feature_columns]
y = df['close']

# 80/20 split with a fixed seed so the partitions are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# Report the dimensions of the training features and targets
for label, part in (("Shape of X_train:", X_train), ("Shape of y_train:", y_train)):
    print(label, part.shape)


Shape of X_train: (100, 9)
Shape of y_train: (100,)


In [20]:


# Create an unfitted linear regression model with scikit-learn's default settings
model = LinearRegression()


In [21]:
try:
    # Train the model using the training data
    model.fit(X_train, y_train)
except Exception as e:
    # Report the failure, then re-raise it. Swallowing the exception would
    # leave `model` unfitted and make the following predict() call fail with
    # an unrelated error that hides the real cause.
    print("Error occurred during model fitting:", e)
    raise

In [22]:
# Predict the closing price for every row of the held-out test features
predictions = model.predict(X_test)

# Print the raw array of predicted values
print(predictions)

[2929.45092141 2712.23529921 2879.08849003 2455.09032563 2657.9805063
 2356.85590524 2610.61424576 2336.81940693 2331.1438709  2887.78250633
 2924.17228402 2311.6722103  2652.38964246 2334.02723224 2914.11555889
 2462.88920076 2956.94971615 2296.2627841  2926.82951394 2348.7373543
 2736.65794335 2436.44922773 2987.23849161 2942.02494473 2902.63527564]


In [23]:
# Score the predictions against the held-out targets with mean squared error
# (the average of the squared differences; lower is better)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 14.54431093482756
