In [None]:
## Q1. Load the flight price dataset and examine its dimensions.

import pandas as pd

# Read the flight price CSV (path is relative to the notebook's working directory).
df = pd.read_csv('flight_price.csv')

# df.shape is (number_of_rows, number_of_columns); pick each entry by position.
rows = df.shape[0]
columns = df.shape[1]
print(f"The dataset has {rows} rows and {columns} columns.")

In [None]:
## Q2. What is the distribution of flight prices in the dataset? Create a histogram to visualize the distribution.

import matplotlib.pyplot as plt

# Draw the price histogram on an explicitly created 10x6 figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['price'], bins=30, edgecolor='black')
ax.set_title('Distribution of Flight Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
## Q3. What is the range of prices in the dataset? What is the minimum and maximum price?

# Smallest and largest values found in the 'price' column.
# (A stray bare `python` token used to sit here; it was evaluated as a name
# and raised NameError before any of the code below could run.)
min_price = df['price'].min()
max_price = df['price'].max()

print(f"The minimum price is {min_price} and the maximum price is {max_price}.")


In [None]:
## Q4. How does the price of flights vary by airline? Create a boxplot to compare the prices of different airlines.

# Plot boxplot of flight prices by airline.
# The Axes is created up front and passed to DataFrame.boxplot: when `by=` is
# given without `ax`, pandas opens a figure of its own, which would leave a
# separately created 12x6 figure empty and ignore the requested size.
fig, ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='price', by='airline', grid=False, ax=ax)
ax.set_title('Flight Prices by Airline')
fig.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel('Airline')
ax.set_ylabel('Price')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
## Q5. Are there any outliers in the dataset? Identify any potential outliers using a boxplot and describe how they may impact your analysis.

# Single boxplot of the 'price' column, drawn on an explicit 12x6 axes.
fig, ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='price', grid=False, ax=ax)
ax.set_title('Boxplot of Flight Prices')
ax.set_xlabel('Flight Prices')
ax.set_ylabel('Price')
plt.show()

In [None]:
## Q6. Identifying the peak travel season. To identify the peak travel season, analyze features like date, month, and season.

# Parse the 'date' column into datetimes (leaves already-parsed values as they are).
df['date'] = pd.to_datetime(df['date'])

# Calendar month number, plus a "season" label taken to be the calendar quarter
# (the string form of the quarterly period of each date).
df['month'] = df['date'].dt.month
df['season'] = df['date'].dt.to_period('Q').astype(str)

# Row counts per month and per season, ordered by the label rather than by count.
monthly_counts = df['month'].value_counts().sort_index()
season_counts = df['season'].value_counts().sort_index()

# Side-by-side bar charts: months on the left, seasons on the right.
fig, (month_ax, season_ax) = plt.subplots(1, 2, figsize=(12, 6))

monthly_counts.plot(kind='bar', ax=month_ax)
month_ax.set_title('Flights per Month')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Number of Flights')

season_counts.plot(kind='bar', ax=season_ax)
season_ax.set_title('Flights per Season')
season_ax.set_xlabel('Season')
season_ax.set_ylabel('Number of Flights')

plt.tight_layout()
plt.show()

In [None]:
## Q7. Identifying trends in flight prices.

# Analyze features like date, day_of_week, month, and year.

# Weekday number and calendar year, read from 'date' through the .dt accessor
# (so 'date' must already be datetime; the Q6 cell converts it and adds 'month').
df['day_of_week'] = df['date'].dt.dayofweek
df['year'] = df['date'].dt.year

# Mean price for every distinct value of each time feature.
daily_avg_price = df['price'].groupby(df['day_of_week']).mean()
monthly_avg_price = df['price'].groupby(df['month']).mean()
yearly_avg_price = df['price'].groupby(df['year']).mean()

# One bar chart per feature, laid out in a single row of three panels.
trend_panels = [
    (daily_avg_price, 'Average Price by Day of Week', 'Day of Week'),
    (monthly_avg_price, 'Average Price by Month', 'Month'),
    (yearly_avg_price, 'Average Price by Year', 'Year'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for panel_ax, (avg_price, panel_title, panel_xlabel) in zip(axes, trend_panels):
    avg_price.plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Average Price')

plt.tight_layout()
plt.show()


In [None]:
## Q8. Identifying factors that affect flight prices.

# Analyze features such as airline, destination, source, duration, day_of_week, month, class, etc.
import seaborn as sns

# Explore correlation matrix.
# Only numeric columns are correlated: calling .corr() on a frame that still
# holds text columns (airline, class, ...) raises on pandas >= 2.0, where
# non-numeric columns are no longer dropped silently.
correlation_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Visualize relationship with key features: price spread per airline.
plt.figure(figsize=(12, 6))
sns.boxplot(x='airline', y='price', data=df)
plt.title('Flight Prices by Airline')
plt.xticks(rotation=45)
plt.show()

# Additional visualizations: linear fit of price against duration ...
sns.lmplot(x='duration', y='price', data=df)
plt.title('Price vs. Duration')
plt.show()

# ... and price spread per travel class.
sns.boxplot(x='class', y='price', data=df)
plt.title('Price by Class')
plt.show()

In [None]:
## Q9. Load the Google Playstore dataset and examine its dimensions. How many rows and columns does the dataset have?

import pandas as pd

# Read the Play Store CSV. NOTE: this rebinds `df`, so the flight price frame
# loaded earlier is no longer reachable under that name from here on.
df = pd.read_csv('GooglePlayStore.csv')

# df.shape is (number_of_rows, number_of_columns); pick each entry by position.
rows = df.shape[0]
columns = df.shape[1]
print(f'The dataset has {rows} rows and {columns} columns.')

In [None]:
## Q10. How does the rating of apps vary by category? Create a boxplot to compare the ratings of different app categories.

import seaborn as sns
import matplotlib.pyplot as plt

# One box per category on a wide 15x8 axes; category labels are turned
# vertical so they do not run into each other.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(x='Category', y='Rating', data=df, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_title('Ratings by App Category')
ax.set_xlabel('App Category')
ax.set_ylabel('Rating')
plt.show()

In [None]:
## Q11. Are there any missing values in the dataset? Identify any missing values and describe how they may impact your analysis.

# Per-column count of missing entries (isna is the same check as isnull).
missing_values = df.isna().sum()
print(missing_values)

# Grand total across all columns.
print(f'Total missing values: {missing_values.sum()}')

# Impact: missing values shrink the data available for analysis and can bias
# the results.

In [None]:
## Q12. What is the relationship between the size of an app and its rating? Create a scatter plot to visualize the relationship.

# Remove entries where Size is 'Varies with device'. The .copy() makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the original frame (SettingWithCopyWarning).
df = df[df['Size'] != 'Varies with device'].copy()

# Convert Size to kilobytes so it matches the axis label below.
# The previous conversion multiplied 'k' values by 1000 and left 'M' values
# unchanged, which made kilobyte-sized apps look far larger than megabyte-sized
# ones. Here 'M' values are scaled by 1024 and 'k' values are kept as they are.
# Values without a recognised suffix are treated as already being in KB
# (assumption - confirm against the data); this also keeps the cell safe to
# re-run once Size is numeric. Anything unparsable becomes NaN instead of
# raising.
size_text = df['Size'].astype(str).str.strip()
size_number = pd.to_numeric(size_text.str.rstrip('Mk'), errors='coerce')
kb_per_unit = size_text.str[-1].map({'M': 1024, 'k': 1}).fillna(1)
df['Size'] = size_number * kb_per_unit

# Create a scatter plot of rating against size.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Size', y='Rating', data=df)
plt.title('Relationship between App Size and Rating')
plt.xlabel('App Size (KB)')
plt.ylabel('Rating')
plt.show()

In [None]:
## Q13. How does the type of app affect its price? Create a bar chart to compare average prices by app type.

# Convert Price to numeric.
# '$' is a regex end-of-string anchor, so literal matching is requested
# explicitly instead of relying on the pandas-version-dependent default of
# `regex`. astype(str) keeps the cell re-runnable after Price is already
# numeric, and unparsable entries become NaN (ignored by mean()) rather than
# raising.
price_text = df['Price'].astype(str).str.replace('$', '', regex=False)
df = df.assign(Price=pd.to_numeric(price_text, errors='coerce'))

# Group by Type and calculate average price
avg_price_by_type = df.groupby('Type')['Price'].mean().reset_index()

# Create a bar chart
plt.figure(figsize=(8, 6))
sns.barplot(x='Type', y='Price', data=avg_price_by_type)
plt.title('Average Price by App Type')
plt.xlabel('App Type')
plt.ylabel('Average Price ($)')
plt.show()

In [None]:
## Q14. What are the top 10 most popular apps in the dataset? Create a frequency table to identify the apps with the highest number of installs.

# Convert Installs to numeric.
# '[+,]' is a regex character class (strip every '+' and ','), so regex=True
# must be explicit: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, nothing would be removed and astype(int) would
# fail on values such as '10,000+'. astype(str) keeps the cell re-runnable
# after the column is already integer. The int conversion is deliberately
# strict: a value that is still not a whole number raises instead of being
# silently dropped.
installs_text = df['Installs'].astype(str).str.replace('[+,]', '', regex=True)
df = df.assign(Installs=installs_text.astype(int))

# Identify top 10 most popular apps (rows with the largest install counts)
top_10_apps = df.nlargest(10, 'Installs')[['App', 'Installs']]

# Display the top 10 apps
print(top_10_apps)

In [None]:
## Q15. How to identify the most popular app categories for a new app launch?

# Total installs per category, largest first.
category_installs = (
    df.groupby('Category')['Installs']
    .sum()
    .reset_index()
    .sort_values(by='Installs', ascending=False)
)

# Bar chart of the totals; category labels are turned vertical for legibility.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x='Category', y='Installs', data=category_installs, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_title('Total Installs by App Category')
ax.set_xlabel('App Category')
ax.set_ylabel('Total Installs')
plt.show()

In [None]:
## Q16. Identifying the most successful app developers

# Total installs per developer, largest first.
# NOTE(review): requires a 'Developer' column in the loaded CSV - confirm it exists.
developer_success = (
    df.groupby('Developer')['Installs']
    .sum()
    .reset_index()
    .sort_values(by='Installs', ascending=False)
)

# Keep only the ten developers with the highest totals for plotting.
top_developers = developer_success.head(10)

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x='Developer', y='Installs', data=top_developers, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_title('Top 10 Developers by Installs')
ax.set_xlabel('Developer')
ax.set_ylabel('Total Installs')
plt.show()

In [None]:
## Q17. Identifying the best time to launch a new app

# Trend over time, using the 'Last Updated' field as the time reference
# (assumes the dataset provides that column).

# Parse the update date and keep its calendar year as a separate column.
last_updated = pd.to_datetime(df['Last Updated'])
df['Last Updated'] = last_updated
df['Year'] = last_updated.dt.year

# Total installs accumulated per update year.
installs_by_year = df.groupby('Year')['Installs'].sum().reset_index()

# Line chart of the yearly totals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='Year', y='Installs', data=installs_by_year, ax=ax)
ax.set_title('Total Installs by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Total Installs')
plt.show()