In [21]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Step 2: Load the dataset from the URL
url = "https://raw.githubusercontent.com/harimittapalli/Mulitple-Linear-Reggression/refs/heads/master/50_Startups.csv"
data = pd.read_csv(url)

# Step 3: Preprocess the data
# One-hot encode the 'State' column
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_state = onehot_encoder.fit_transform(data[['State']])
state_df = pd.DataFrame(encoded_state, columns=['State_' + str(int(i)) for i in range(encoded_state.shape[1])])
data = pd.concat([data, state_df], axis=1)
data.drop(['State'], axis=1, inplace=True)

# Define independent variables X (all columns except 'Profit') and dependent variable y ('Profit')
X = data.drop('Profit', axis=1)
y = data['Profit']

# Step 4: Train models with varying numbers of features and calculate MSE
results = []

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train multiple models with varying numbers of features
for num_features in range(1, X_train.shape[1] + 1):
    model = LinearRegression()
    model.fit(X_train.iloc[:, :num_features], y_train)  # Train with first 'num_features' features
    y_pred = model.predict(X_test.iloc[:, :num_features])  # Predict with the same number of features
    mse = mean_squared_error(y_test, y_pred)  # Calculate MSE
    selected_features = X_train.columns[:num_features].tolist()  # Get the feature names
    results.append((num_features, selected_features, mse))

# Step 5: Create a DataFrame to display the results
result_df = pd.DataFrame(results, columns=['Num Features', 'Selected Features', 'MSE'])

# Display the results
print("\nResults for Different Numbers of Variables:")
print(result_df)

# Step 6: Find the best model with the lowest MSE
best_model = result_df.loc[result_df['MSE'].idxmin()]
best_num_features = best_model['Num Features']
best_features = best_model['Selected Features']
best_mse = best_model['MSE']

print(f"\nBest Model (Minimum MSE):")
print(f"Number of Features: {best_num_features}")
print(f"Selected Features: {best_features}")
print(f"Best MSE: {best_mse:.2f}")



Results for Different Numbers of Variables:
   Num Features                                  Selected Features  \
0             1                                        [R&D Spend]   
1             2                        [R&D Spend, Administration]   
2             3       [R&D Spend, Administration, Marketing Spend]   
3             4  [R&D Spend, Administration, Marketing Spend, S...   
4             5  [R&D Spend, Administration, Marketing Spend, S...   
5             6  [R&D Spend, Administration, Marketing Spend, S...   

            MSE  
0  5.951096e+07  
1  8.376413e+07  
2  8.092632e+07  
3  8.118573e+07  
4  8.201036e+07  
5  8.201036e+07  

Best Model (Minimum MSE):
Number of Features: 1
Selected Features: ['R&D Spend']
Best MSE: 59510962.81
