This task involved creating and manipulating a DataFrame with pandas to practice selection, filtering, sorting, handling missing values, and grouping.

In [None]:
# Make Google Drive available inside the Colab runtime so its files can be read by path
from google.colab import drive # imports the Drive helper module (not a data file)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load pandas under the alias `pd`; pandas functions are then called as `pd.<name>`, e.g. `pd.read_csv()`.
import pandas as pd
# Location of the CSV file under the Drive mount point (/content/drive).
file_path = '/content/drive/MyDrive/Airbnb2/AB_NYC_2019 - AB_NYC_2019.csv'
# Read the CSV into a DataFrame.
df = pd.read_csv(file_path)
# Last expression of the cell: displays the first five rows as a quick preview.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [None]:
# 2. Create a DataFrame
import pandas as pd

# Column-oriented input: each key is a column label and each list holds that
# column's values, one entry per person (row).
data = {
    'Name': ['John', 'Alice', 'Bob', 'Charlie', 'David'],
    'Age': [23, 30, 25, 35, 28],
    'City': ['New York', 'Los Angeles', 'Chicago', 'San Francisco', 'New York'],
    'Salary': [50000, 70000, 45000, 80000, 55000],
}

# Build the table from the dictionary, then print it to check the result.
df = pd.DataFrame(data=data)
print(df)

      Name  Age           City  Salary
0     John   23       New York   50000
1    Alice   30    Los Angeles   70000
2      Bob   25        Chicago   45000
3  Charlie   35  San Francisco   80000
4    David   28       New York   55000


In [None]:
# 3. Confirm the DataFrame was created: print a heading, then every row
print("Full DataFrame:", df, sep="\n")

Full DataFrame:
      Name  Age           City  Salary
0     John   23       New York   50000
1    Alice   30    Los Angeles   70000
2      Bob   25        Chicago   45000
3  Charlie   35  San Francisco   80000
4    David   28       New York   55000


In [None]:
# 4b. Preview the top of the table: head(3) returns the first 3 rows in their original order
print("\nFirst 3 rows:", df.head(n=3), sep="\n")


First 3 rows:
    Name  Age         City  Salary
0   John   23     New York   50000
1  Alice   30  Los Angeles   70000
2    Bob   25      Chicago   45000


In [None]:
# 4c. Average salary: select the whole Salary column and take its arithmetic mean
average_salary = df.loc[:, 'Salary'].mean()
print("\nAverage Salary:", average_salary)


Average Salary: 60000.0


In [None]:
# 4d. Keep only the rows whose Age is at least 30
# The comparison yields a True/False value per row; .loc keeps the True rows.
print("\nRows where Age >= 30:")
print(df.loc[df['Age'].ge(30)])



Rows where Age >= 30:
      Name  Age           City  Salary
1    Alice   30    Los Angeles   70000
3  Charlie   35  San Francisco   80000


In [None]:
# 5a. Order the rows from highest to lowest Salary
# sort_values returns a new, sorted frame; df itself is left unchanged.
print("\nSorted by Salary (descending):", df.sort_values('Salary', ascending=False), sep="\n")


Sorted by Salary (descending):
      Name  Age           City  Salary
3  Charlie   35  San Francisco   80000
1    Alice   30    Los Angeles   70000
4    David   28       New York   55000
0     John   23       New York   50000
2      Bob   25        Chicago   45000


In [None]:
# 5b. Order the rows by City first; rows sharing a City are then ordered by Age
# Both keys sort ascending, which is the default made explicit here.
print("\nSorted by City and Age:")
print(df.sort_values(['City', 'Age'], ascending=True))



Sorted by City and Age:
      Name  Age           City  Salary
2      Bob   25        Chicago   45000
1    Alice   30    Los Angeles   70000
0     John   23       New York   50000
4    David   28       New York   55000
3  Charlie   35  San Francisco   80000


In [None]:
# 6. Introduce missing values
# Salary starts out as an integer column, and an integer column cannot hold a
# missing value. Cast it to float explicitly before inserting NaN: relying on
# pandas to upcast the column silently during the assignment is deprecated
# (FutureWarning since pandas 2.1). The cast is a no-op if the cell is re-run.
df['Salary'] = df['Salary'].astype(float)
df.loc[2, 'Salary'] = float('nan')  # Bob's salary becomes missing (NaN)
df.loc[4, 'City'] = None  # David's city becomes missing

In [None]:
# 6a. Show the table again so the newly introduced gaps are visible
print("\nDataFrame with missing values:", df, sep="\n")


DataFrame with missing values:
      Name  Age           City   Salary
0     John   23       New York  50000.0
1    Alice   30    Los Angeles  70000.0
2      Bob   25        Chicago      NaN
3  Charlie   35  San Francisco  80000.0
4    David   28           None  55000.0


In [None]:
# 6b. Replace the missing Salary with the column average
# mean() skips NaN by default, so the average covers the known salaries only.
mean_salary = df['Salary'].mean()
# fillna returns a new Series; assign it back so the DataFrame column is updated.
df['Salary'] = df['Salary'].fillna(value=mean_salary)
print("\nDataFrame after filling missing Salary with average:", df, sep="\n")


DataFrame after filling missing Salary with average:
      Name  Age           City   Salary
0     John   23       New York  50000.0
1    Alice   30    Los Angeles  70000.0
2      Bob   25        Chicago  63750.0
3  Charlie   35  San Francisco  80000.0
4    David   28           None  55000.0


In [None]:
# 6c. Replace the missing City with the placeholder text 'Unknown'
# fillna returns a new Series; assign it back so the DataFrame column is updated.
df['City'] = df['City'].fillna(value='Unknown')
print("\nDataFrame after filling missing City with 'Unknown':", df, sep="\n")


DataFrame after filling missing City with 'Unknown':
      Name  Age           City   Salary
0     John   23       New York  50000.0
1    Alice   30    Los Angeles  70000.0
2      Bob   25        Chicago  63750.0
3  Charlie   35  San Francisco  80000.0
4    David   28        Unknown  55000.0


In [None]:
# 7. Write the cleaned table to 'Cleaned_data.csv' in the current working directory
# index=False leaves the row labels (0, 1, 2, ...) out of the file.
df.to_csv(path_or_buf='Cleaned_data.csv', index=False)

In [None]:
# Bonus: average Salary per City
# Rows are bucketed by their City value, then the Salary values in each bucket
# are averaged; the result is a Series indexed by City.
grouped = df.groupby(by='City')['Salary'].mean()
print("\nAverage salary by City:", grouped, sep="\n")


Average salary by City:
City
Chicago          63750.0
Los Angeles      70000.0
New York         50000.0
San Francisco    80000.0
Unknown          55000.0
Name: Salary, dtype: float64
