In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Notebook-wide plotting defaults. The matplotlib style sheet is applied
# first so the seaborn context/palette and the figure size set afterwards
# take precedence over anything the style sheet defines.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
sns.set_context("notebook", font_scale=1.5)
plt.rc('figure', figsize=(12, 6))

In [None]:
# Load the dataset and show a quick structural overview.
# The path is kept exactly as written (note the space before '.csv') --
# TODO confirm it matches the file name on disk.
try:
    data = pd.read_csv('Dataset .csv')
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `data`, so
    # continuing would either fail with a NameError or silently reuse a
    # stale `data` left over in the kernel.
    print(f"An error occurred: {e}")
    raise
else:
    print("Dataset shape:", data.shape)
    print("Dataset info:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly rather than wrapped in display().
    data.info()
    display(data.head())

In [None]:
# Count missing values per column and, if there are any, plot them as a bar
# chart ordered from most to least missing.
print("Missing values in the dataset:")
missing = data.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if missing.empty:
    print("No missing values in the dataset.")
else:
    # The figure is created only when there is something to draw, so the
    # "no missing values" path does not leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=missing.index, y=missing.values, ax=ax)
    ax.set_title('Missing Values by Column')
    ax.set_xlabel('Columns')
    ax.set_ylabel('Count')
    plt.xticks(rotation=0, ha='center')
    fig.tight_layout()
    plt.show()

In [None]:
# Impute missing 'Cuisines' entries with the most frequent value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves `data` unchanged when
# Copy-on-Write is enabled.
data['Cuisines'] = data['Cuisines'].fillna(data['Cuisines'].mode()[0])

In [None]:

# One-hot encode the categorical columns listed below. drop_first=True omits
# the first level of each column so the dummies are not redundant.
# NOTE(review): 'Rating color' and 'Rating text' look like they may be derived
# from 'Aggregate rating' (the regression target used later) -- confirm they
# do not leak the target into the features.
categorical_features = [
    'Has Table booking',
    'Has Online delivery',
    'Is delivering now',
    'Switch to order menu',
    'Rating color',
    'Rating text',
]
data_encoded = pd.get_dummies(
    data,
    drop_first=True,
    columns=categorical_features,
)


In [None]:
# Separate the regression target ('Aggregate rating') from the predictors.
y = data_encoded['Aggregate rating']
X = data_encoded.drop(columns='Aggregate rating')

# Predictors without the free-text columns. Note that the split below is
# performed on X, not on X_numeric; errors='ignore' tolerates either column
# being absent.
text_columns = ['Restaurant Name', 'Address']
X_numeric = X.drop(columns=text_columns, errors='ignore')

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:


# Restrict both splits to int64 / float64 / bool columns; any other dtype
# (e.g. object columns holding text) is excluded from the model inputs.
X_train_numeric = X_train.select_dtypes(include=['int64', 'float64', 'bool']).copy()
X_test_numeric = X_test.select_dtypes(include=['int64', 'float64', 'bool']).copy()

print("\n### Training Linear Regression model ###")
print(f"Using {len(X_train_numeric.columns)} numeric features for regression")

# Fit ordinary least squares on the training split and score it on the
# hold-out split (fit() returns the estimator itself).
linear_model = LinearRegression().fit(X_train_numeric, y_train)
y_pred = linear_model.predict(X_test_numeric)
r2 = r2_score(y_test, y_pred)

# One row per input column with its fitted coefficient; consumed by the
# plotting cell further down. The features are not scaled here, so the
# coefficient magnitudes depend on each feature's units.
feature_importance = pd.DataFrame(
    {
        'Feature': X_train_numeric.columns,
        'Coefficient': linear_model.coef_,
    }
)

In [None]:
# Report the hold-out R² of the fitted linear regression.
print("Training complete.")
print("Linear Regression Model Performance:")
print(f"  R²: {r2:.4f}")

In [None]:
# Visualizations: coefficient magnitudes, correlations, model fit, geography, rating categories

# 1. Feature importance: rank features by the absolute value of their
#    regression coefficient and plot the ten largest.
feature_importance['Absolute_Coefficient'] = feature_importance['Coefficient'].abs()
top_features = (
    feature_importance
    .sort_values('Absolute_Coefficient', ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=top_features, x='Absolute_Coefficient', y='Feature', ax=ax)
ax.set_title("Most Influential Features for Restaurant Ratings", fontsize=16)
ax.set_xlabel("Impact Magnitude", fontsize=14)
ax.set_ylabel("Features", fontsize=14)
fig.tight_layout()
plt.show()

# 2. Pairwise correlations between the selected numeric columns (including
#    the target 'Aggregate rating'), drawn as a lower-triangle heatmap.
numeric_columns = [
    'Longitude',
    'Latitude',
    'Average Cost for two',
    'Votes',
    'Price range',
    'Aggregate rating',
]
df_numeric = data[numeric_columns]
corr_matrix = df_numeric.corr()

# Non-zero entries mark cells to hide: the upper triangle and the diagonal,
# which only repeat the values shown in the lower triangle.
mask = np.triu(np.ones_like(corr_matrix))

fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlations", fontsize=16)
fig.tight_layout()
plt.show()

# 3. Actual vs. predicted ratings on the hold-out split. The dashed red
#    diagonal is the line where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

diagonal_ends = [y_test.min(), y_test.max()]
ax.plot(diagonal_ends, diagonal_ends, 'r--')

ax.set_xlabel('Actual Ratings', fontsize=14)
ax.set_ylabel('Predicted Ratings', fontsize=14)
ax.set_title(f'Model Accuracy (R² = {r2:.3f})', fontsize=16)
fig.tight_layout()
plt.show()

# 4. Geographic view: one marker per row at (Longitude, Latitude), coloured
#    by 'Aggregate rating' and sized by Votes / 50. Rows with zero votes
#    therefore get a marker area of 0.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    data['Longitude'],
    data['Latitude'],
    c=data['Aggregate rating'],
    s=data['Votes'] / 50,
    cmap='viridis',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Rating')
ax.set_title('Geographic Distribution of Restaurant Ratings', fontsize=16)
ax.set_xlabel('Longitude', fontsize=14)
ax.set_ylabel('Latitude', fontsize=14)
fig.tight_layout()
plt.show()

# 5. Number of rows per 'Rating text' category, with the count written
#    above each bar. Skipped when the column is not present.
# NOTE(review): recent seaborn releases warn when `palette` is passed
# without `hue` -- confirm against the installed version.
if 'Rating text' in data.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='Rating text', data=data, palette='viridis', ax=ax)
    ax.set_title('Distribution of Rating Categories', fontsize=16)
    ax.set_xlabel('Rating Category', fontsize=14)
    ax.set_ylabel('Number of Restaurants', fontsize=14)

    # Label every bar with its height, centred horizontally on the bar and
    # anchored just above its top edge.
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(bar_height)}',
            (bar_centre, bar_height),
            ha='center',
            va='bottom',
        )

    fig.tight_layout()
    plt.show()