In [2]:
import pandas as pd
import numpy as np

# Load datasets
apps = pd.read_csv(r'C:\Users\holar\Documents\Excel Project\apps.csv') 
user_reviews = pd.read_csv(r'C:\Users\holar\Documents\Excel Project\user_reviews.csv')

print("Apps Dataset Preview:")
print(apps.head())

print("\nUser Reviews Dataset Preview:")
print(user_reviews.head())

Apps Dataset Preview:
   Unnamed: 0                                                App  \
0           0     Photo Editor & Candy Camera & Grid & ScrapBook   
1           1                                Coloring book moana   
2           2  U Launcher Lite – FREE Live Cool Themes, Hide ...   
3           3                              Sketch - Draw & Paint   
4           4              Pixel Draw - Number Art Coloring Book   

         Category  Rating  Reviews  Size     Installs  Type Price  \
0  ART_AND_DESIGN     4.1      159  19.0      10,000+  Free     0   
1  ART_AND_DESIGN     3.9      967  14.0     500,000+  Free     0   
2  ART_AND_DESIGN     4.7    87510   8.7   5,000,000+  Free     0   
3  ART_AND_DESIGN     4.5   215644  25.0  50,000,000+  Free     0   
4  ART_AND_DESIGN     4.3      967   2.8     100,000+  Free     0   

  Content Rating                     Genres      Last Updated  \
0       Everyone               Art & Design   January 7, 2018   
1       Everyone  Art & 

In [8]:
# Drop duplicates
apps.drop_duplicates(subset='App', inplace=True)

# Clean 'Size' column
def size_to_mb(size):
    if pd.isnull(size):  # Handle NaN values
        return np.nan
    if isinstance(size, str):  # Ensure size is a string
        if 'M' in size:
            return float(size.replace('M', ''))
        elif 'k' in size:
            return float(size.replace('k', '')) / 1024  # Convert KB to MB
    return np.nan  # Return NaN for other cases

apps['Size'] = apps['Size'].replace('Varies with device', np.nan).apply(size_to_mb)

# Clean 'Price' column
apps['Price'] = apps['Price'].astype(str).str.replace('$', '').replace('Free', '0').astype(float)

# Handle missing ratings by filling with the mean
mean_rating = apps['Rating'].mean()
apps['Rating'] = apps['Rating'].fillna(mean_rating)

# Convert 'Reviews' to integer
apps['Reviews'] = apps['Reviews'].astype(int)

print("\nCleaned Apps Dataset:")
print(apps.info())


Cleaned Apps Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9659 entries, 0 to 9658
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      9659 non-null   int64  
 1   App             9659 non-null   object 
 2   Category        9659 non-null   object 
 3   Rating          9659 non-null   float64
 4   Reviews         9659 non-null   int32  
 5   Size            0 non-null      float64
 6   Installs        9659 non-null   object 
 7   Type            9659 non-null   object 
 8   Price           9659 non-null   float64
 9   Content Rating  9659 non-null   object 
 10  Genres          9659 non-null   object 
 11  Last Updated    9659 non-null   object 
 12  Current Ver     9651 non-null   object 
 13  Android Ver     9657 non-null   object 
dtypes: float64(3), int32(1), int64(1), object(9)
memory usage: 1018.9+ KB
None


In [9]:
print(apps.head())

   Unnamed: 0                                                App  \
0           0     Photo Editor & Candy Camera & Grid & ScrapBook   
1           1                                Coloring book moana   
2           2  U Launcher Lite – FREE Live Cool Themes, Hide ...   
3           3                              Sketch - Draw & Paint   
4           4              Pixel Draw - Number Art Coloring Book   

         Category  Rating  Reviews  Size     Installs  Type  Price  \
0  ART_AND_DESIGN     4.1      159   NaN      10,000+  Free    0.0   
1  ART_AND_DESIGN     3.9      967   NaN     500,000+  Free    0.0   
2  ART_AND_DESIGN     4.7    87510   NaN   5,000,000+  Free    0.0   
3  ART_AND_DESIGN     4.5   215644   NaN  50,000,000+  Free    0.0   
4  ART_AND_DESIGN     4.3      967   NaN     100,000+  Free    0.0   

  Content Rating                     Genres      Last Updated  \
0       Everyone               Art & Design   January 7, 2018   
1       Everyone  Art & Design;Pretend P

In [10]:
# Fill missing 'Sentiment' values with 'Neutral'
user_reviews['Sentiment'] = user_reviews['Sentiment'].fillna('Neutral')
user_reviews['Sentiment_Polarity'] = user_reviews['Sentiment_Polarity'].fillna(0)
user_reviews['Sentiment_Subjectivity'] = user_reviews['Sentiment_Subjectivity'].fillna(0)

print("\nCleaned User Reviews Dataset:")
print(user_reviews.info())


Cleaned User Reviews Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               64295 non-null  object 
 3   Sentiment_Polarity      64295 non-null  float64
 4   Sentiment_Subjectivity  64295 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB
None


In [18]:
# Merge datasets on 'App'
merged_data = pd.merge(apps, user_reviews, on='App', how='inner')

# Preview the merged dataset
print("\nMerged Dataset Preview:")
print(merged_data.head())


Merged Dataset Preview:
   Unnamed: 0                  App        Category  Rating  Reviews  Size  \
0           1  Coloring book moana  ART_AND_DESIGN     3.9      967   NaN   
1           1  Coloring book moana  ART_AND_DESIGN     3.9      967   NaN   
2           1  Coloring book moana  ART_AND_DESIGN     3.9      967   NaN   
3           1  Coloring book moana  ART_AND_DESIGN     3.9      967   NaN   
4           1  Coloring book moana  ART_AND_DESIGN     3.9      967   NaN   

   Installs  Type  Price Content Rating                     Genres  \
0  500,000+  Free    0.0       Everyone  Art & Design;Pretend Play   
1  500,000+  Free    0.0       Everyone  Art & Design;Pretend Play   
2  500,000+  Free    0.0       Everyone  Art & Design;Pretend Play   
3  500,000+  Free    0.0       Everyone  Art & Design;Pretend Play   
4  500,000+  Free    0.0       Everyone  Art & Design;Pretend Play   

       Last Updated Current Ver   Android Ver  \
0  January 15, 2018       2.0.0  4.0.3 and

In [19]:
# Category distribution
category_counts = merged_data['Category'].value_counts()
print("\nCategory Distribution:")
print(category_counts)


Category Distribution:
Category
GAME                   10540
FAMILY                  3969
HEALTH_AND_FITNESS      3456
DATING                  3098
PRODUCTIVITY            2760
TOOLS                   2612
SPORTS                  2600
PHOTOGRAPHY             2600
TRAVEL_AND_LOCAL        2500
COMMUNICATION           2240
MEDICAL                 2234
FINANCE                 2200
ENTERTAINMENT           2180
SHOPPING                1920
EDUCATION               1860
PERSONALIZATION         1800
BUSINESS                1600
LIFESTYLE               1563
NEWS_AND_MAGAZINES      1560
BOOKS_AND_REFERENCE     1112
SOCIAL                   980
FOOD_AND_DRINK           951
HOUSE_AND_HOME           899
AUTO_AND_VEHICLES        603
BEAUTY                   591
ART_AND_DESIGN           548
VIDEO_PLAYERS            500
LIBRARIES_AND_DEMO       480
PARENTING                440
MAPS_AND_NAVIGATION      440
EVENTS                   280
WEATHER                  240
COMICS                   200
Name: coun

In [20]:
# Correlation between Rating, Reviews, and Size
correlation_matrix = merged_data[['Rating', 'Reviews', 'Size']].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)


Correlation Matrix:
           Rating   Reviews  Size
Rating   1.000000  0.093589   NaN
Reviews  0.093589  1.000000   NaN
Size          NaN       NaN   NaN


In [21]:
# Sentiment counts
sentiment_counts = merged_data['Sentiment'].value_counts()
print("\nSentiment Distribution:")
print(sentiment_counts)


Sentiment Distribution:
Sentiment
Neutral     30478
Positive    23073
Negative     8005
Name: count, dtype: int64


In [22]:
merged_data.to_csv(r'C:\Users\holar\Documents\Excel Project\merged_data.csv', index=False)