**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

**Load Dataset**

In [None]:
# Location of the raw dataset, resolved relative to the notebook's working directory.
csv_path = 'googleplaystore.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

**Basic Data Exploration**

In [None]:
# Only the last expression of a cell is rendered, so the shape and the column
# list are printed explicitly; as bare expressions they were computed and
# silently discarded.
print("Shape:", df.shape)
print("Columns:", list(df.columns))

# info() writes the dtypes and non-null counts straight to stdout.
df.info()

# Summary statistics, rendered as the cell output.
df.describe()

**Data Cleaning**

In [None]:
# Remove the row labelled 10472, which carries an incorrect rating value.
# errors='ignore' keeps this cell re-runnable: once the row is gone, executing
# the cell again is a no-op instead of raising KeyError.
df.drop(index=10472, inplace=True, errors='ignore')

# Cast the review counts to integers.
df['Reviews'] = df['Reviews'].astype(int)

# Function to convert size to bytes
def convert_into_bytes(size):
    """Convert a size label such as ``'19M'`` or ``'201k'`` into a byte count.

    Parameters
    ----------
    size : str or any
        A size label ending in ``k`` (kibibytes) or ``M`` (mebibytes), the
        placeholder ``'Varies with device'``, or an already-converted /
        non-string value.

    Returns
    -------
    float or original value
        * ``<number>k`` -> ``number * 1024``
        * ``<number>M`` -> ``number * 1024 * 1024``
        * any other string (including ``'Varies with device'`` and labels whose
          numeric part cannot be parsed) -> ``np.nan``, so that a column built
          from this function holds only numeric values
        * non-string input is returned unchanged
    """
    if not isinstance(size, str):
        return size

    label = size.strip()
    # Match on the trailing unit only; a substring test would also fire on a
    # 'k' or 'M' appearing anywhere else in the text.
    for suffix, factor in (('k', 1024), ('M', 1024 * 1024)):
        if label.endswith(suffix):
            try:
                return float(label[:-len(suffix)]) * factor
            except ValueError:
                # Numeric part is not a valid number: treat as missing.
                return np.nan

    # No recognised unit (e.g. 'Varies with device'): treat as missing.
    return np.nan

# Convert size labels to byte counts; 'Varies with device' becomes NaN.
df['Size'] = df['Size'].apply(convert_into_bytes)

# Strip the '+' and ',' decorations from the install counts, then cast.
# regex=False is explicit because '+' is a regex metacharacter and the default
# value of `regex` differs between pandas versions.
df['Installs'] = (
    df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Bucket installs into labelled bands. pd.cut intervals are right-inclusive by
# default, so the first bin (-1, 0] holds apps with exactly zero installs.
bins = [-1, 0, 10, 1000, 10000, 100000, 1000000, 10000000, 10000000000]
labels = ['no','Very low','Low','Moderate','More than moderate','High','Very High','Top Notch']

df['Installs_category'] = pd.cut(df['Installs'], bins=bins, labels=labels)

# '$' is a regex metacharacter too (end of string), so it is matched literally.
df['Price'] = df['Price'].str.replace('$', '', regex=False).astype(float)

# Missing values per column. Printed explicitly: a bare expression that is not
# the last line of the cell is evaluated and then discarded.
print(df.isnull().sum())

# Heatmap for missing values
plt.figure(figsize=(14,6))
sns.heatmap(df.isnull(), cbar=False)
plt.title("Missing Values Heatmap")
plt.show()

# Drop rows that lack any of these descriptive fields.
df.dropna(subset=['Current Ver', 'Android Ver', 'Type', 'Genres'], inplace=True)


**Correlation Analysis**

In [None]:
# Columns included in the pairwise correlation matrix.
numeric_cols = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
corr = df[numeric_cols].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


**Fill Missing Ratings with the Mean Rating of Each Installs Category**

In [None]:
# Mean rating within each install band. Printed explicitly: a bare expression
# in the middle of a cell is not displayed.
rating_means = df.groupby('Installs_category')['Rating'].mean()
print(rating_means)

# Fill every missing rating with the mean of its install band in one
# vectorised step. transform('mean') yields the band mean aligned to each row;
# fillna then uses it only where Rating is NaN.
band_mean = df.groupby('Installs_category')['Rating'].transform('mean')
df['Rating'] = df['Rating'].fillna(band_mean)

# Ratings remain missing only where the band mean itself is NaN (no rated app
# in that band), so this count may be non-zero.
df['Rating'].isnull().sum()

**Remove Duplicates**

In [None]:
# Number of fully duplicated rows. Printed explicitly: only the last
# expression of a cell is displayed.
print("Duplicate rows:", df.duplicated().sum())

# Keep the first occurrence of each duplicated row.
df.drop_duplicates(inplace=True)

# Shape after de-duplication.
df.shape

**Rating Distribution**

In [None]:
# Histogram of ratings with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(8,5))
sns.histplot(df['Rating'], kde=True, ax=ax)
ax.set_title("Distribution of App Ratings")
plt.show()


**Apps Count by Category**

In [None]:
# The ten categories containing the most apps, drawn as a bar chart.
top_categories = df['Category'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12,8))
top_categories.plot(kind='bar', ax=ax)
ax.set_title("Top 10 App Categories")
plt.show()

**Reviews vs Installs (Log Scale)**

In [None]:
# Adding 1 before taking log10 keeps zero counts finite (log10(1) == 0).
log_reviews = np.log10(df['Reviews']+1)
log_installs = np.log10(df['Installs']+1)

fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=log_reviews, y=log_installs, ax=ax)
ax.set_title("Reviews vs Installs (Log Scale)")
ax.set_xlabel("Log Reviews")
ax.set_ylabel("Log Installs")
plt.show()

**Key Insights**

**1. Highest Number of Apps**

In [None]:
# App count per category (value_counts sorts descending); show the first ten.
category_counts = df['Category'].value_counts()
category_counts.iloc[:10]

**2. Highest Installs**

In [None]:
# Total installs per category, largest first; show the top ten.
installs_by_category = df.groupby('Category')['Installs'].sum()
installs_by_category.sort_values(ascending=False).head(10)

**3. Highest Reviews**

In [None]:
# Total reviews per category, largest first; show the top ten.
reviews_by_category = df.groupby('Category')['Reviews'].sum()
reviews_by_category.sort_values(ascending=False).head(10)

**4. Highest Average Rating**

In [None]:
# Mean rating per category, highest first; show the top ten.
mean_rating_by_category = df.groupby('Category')['Rating'].mean()
mean_rating_by_category.sort_values(ascending=False).head(10)

### Conclusion

- GAME and COMMUNICATION categories dominate installs and reviews
- Apps with more installs tend to have more reviews
- Ratings tend to be higher for more popular apps
- Paid apps do not guarantee success
- Size has weak correlation with installs




**Select Features for ML**

In [None]:
# Model inputs plus the target ('Rating'), copied so later edits do not touch df.
ml_columns = ['Reviews', 'Size', 'Installs', 'Price', 'Rating']
ml_df = df[ml_columns].copy()

ml_df.head()

**Handle Remaining Missing Values**

In [None]:
# Remaining gaps per column. Printed explicitly: it is not the last expression
# of the cell, so it would otherwise be discarded.
print(ml_df.isnull().sum())

# Assign the filled column back instead of calling fillna(inplace=True) on
# ml_df['Size']. The chained in-place form operates on an intermediate object
# and, under pandas Copy-on-Write, leaves ml_df unchanged, so NaNs would
# silently reach the model.
ml_df['Size'] = ml_df['Size'].fillna(ml_df['Size'].median())
ml_df['Rating'] = ml_df['Rating'].fillna(ml_df['Rating'].median())

**Split Features & Target**

In [None]:
# 'Rating' is the prediction target; every other column is a feature.
target_col = 'Rating'
X = ml_df.drop(columns=target_col)
y = ml_df[target_col]

**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

**Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Train Linear Regression Model**

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the standardised training features.
# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LinearRegression().fit(X_train_scaled, y_train)
lr

**Make Predictions**

In [None]:
# Predict ratings for the held-out rows, using the scaled test features.
y_pred = lr.predict(X_test_scaled)

**Model Evaluation**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics on the held-out split; RMSE is the square root of MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (("MAE:", mae), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(label, value)

**Actual vs Predicted Plot**

In [None]:
# Scatter of true ratings (x) against model predictions (y) for the test split.
fig, ax = plt.subplots(figsize=(6,6))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Rating")
ax.set_ylabel("Predicted Rating")
ax.set_title("Actual vs Predicted Ratings")
plt.show()

**Feature Importance**

In [None]:
# Pair each feature with its fitted coefficient. The model was trained on
# standardised inputs, so coefficient magnitudes are comparable across features.
# Rank by absolute value: a large negative coefficient is as influential as a
# large positive one, and a signed sort would push it to the bottom.
feature_importance = (
    pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': lr.coef_
    })
    .sort_values(by='Coefficient', key=lambda col: col.abs(), ascending=False)
)

feature_importance

### Machine Learning Conclusion

- Linear Regression was used to predict app ratings
- Reviews and Installs have the strongest impact on ratings
- Price has minimal effect on rating
- Model performs reasonably well for real-world app data
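- This model can help developers estimate an app's expected rating from its reviews, size, installs, and price (reviews and installs only exist after launch, so it cannot be applied before release)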
