In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Target URL (Update with your specific search URL)
base_url = "https://www.kijiji.ca/b-real-estate/thunder-bay/c34l1700126?radius=50.0&address=Lakehead+University%2C+Oliver+Road%2C+Thunder+Bay%2C+ON&ll=48.42111080000001%2C-89.2606994"
# Headers to avoid bot detection
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

In [16]:
data = []
for page in range(1, 10):
    print(f"Scraping page {page}...")
    URL = f"{base_url}{page}"  
    response = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")

    listings = soup.find_all("div", class_="sc-78b01f6c-8 dFACuF")
    

    for listing in listings:
        # Extract title
        title_tag = listing.find("h3", {"data-testid": "listing-title"})
        title = title_tag.text.strip() if title_tag else "N/A"

        # Extract price
        price_tag = listing.find("p", {"data-testid": "listing-price"})
        price = price_tag.text.strip() if price_tag else "N/A"
        
        # Extract description
        description_tag = listing.find("p", {"data-testid": "listing-description"})
        description = description_tag.text.strip() if description_tag else "N/A"

        proximity_tag = listing.find("p", {"data-testid": "listing-proximity"})
        proximity = proximity_tag.text.strip() if proximity_tag else "N/A"
    
        posted_tags = listing.find_all("p", class_="sc-578235a5-0 cOyyWk")  
        posted = posted_tags[2].text.strip() if len(posted_tags) > 2 else "N/A" 


        details_tags = listing.find_all("p", class_="sc-578235a5-0 hLtuc")
        # Get the second <p> element if it exists
        nearest = details_tags[0].text.strip() if len(details_tags) > 0 else "N/A"  
        bedrooms = details_tags[1].text.strip() if len(details_tags) > 1 else "N/A"  
        bathrooms= details_tags[2].text.strip() if len(details_tags) > 2 else "N/A"
        type=details_tags[3].text.strip() if len(details_tags) > 3 else "N/A"
        parking=details_tags[4].text.strip() if len(details_tags) > 4 else "N/A"
        size=details_tags[5].text.strip() if len(details_tags) > 5 else "N/A"


        # Append data to list
        data.append({
            "Title": title,
            "Price": price,
            "Posted_Date": posted,
            "Description": description,
            "Distance": proximity,
            "Nearest_Intersection": nearest,
            "Bedrooms": bedrooms,
            "Bathrooms": bathrooms,
            "Unit_type": type,
            "Parking": parking,
            "Size(sqft)": size
            
        
        })

    # Convert to Pandas DataFrame
df = pd.DataFrame(data)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...


In [18]:
print(df)

Empty DataFrame
Columns: []
Index: []


In [513]:
df.to_csv("kijiji_listings2.csv", index=False)
print("Data saved to kijiji_listings.csv")

Data saved to kijiji_listings.csv


In [515]:
print(df.columns.tolist())


['Title', 'Price', 'Posted_Date', 'Description', 'Distance', 'Nearest_Intersection', 'Bedrooms', 'Bathrooms', 'Unit_type', 'Parking', 'Size(sqft)']


In [517]:
df = df.drop("Posted_Date", axis=1)

In [519]:
df.replace("N/A", pd.NA, inplace=True)
df.replace("Please Contact", pd.NA, inplace=True)
print("Null values in each column:")
print(df.isnull().sum())

Null values in each column:
Title                     0
Price                    45
Description               0
Distance                  0
Nearest_Intersection    252
Bedrooms                252
Bathrooms               261
Unit_type               261
Parking                 261
Size(sqft)              270
dtype: int64


In [521]:
print("Original DataFrame:")
print(df)

# Create a separate dataset with rows that have null values
null_rows = df[df.isnull().any(axis=1)]  # This will select rows with any null values

# Print the new DataFrame with null values
print("\nDataFrame with rows containing null values:")
print(null_rows)

Original DataFrame:
                                  Title                                \
0                                             Single bedroom for rent   
1                                7 Unit Apartment Building for Sale!!   
2                                      0 HWY 130 Thunder Bay, Ontario   
3                                           Room for Rent furnished!!   
4                                 Two bed one bath basement apartment   
5                                                Two bed 1 bath house   
6                                                       Room for rent   
7                          Roommate needed to share a room for female   
8                                  350 Mokomon rd, Kakabeka Falls, ON   
9                      Two bedroom apartment with utilities included.   
10                    Room for Rent - All Inclusive - Fully Furnished   
11                     furnished private room 3 minutes to university   
12                             

In [523]:
null_rows.to_csv('null_values_dataset2.csv', index=False)

In [525]:
df["Price"] = df["Price"].str.replace(r"[^\d]", "", regex=True)  # Remove non-numeric characters
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")  # Convert to numbers
print(df["Price"].head(10))

0        60000.0
1    120000000.0
2     27790000.0
3        85000.0
4       170000.0
5       210000.0
6        75000.0
7        36700.0
8     74000000.0
9       195000.0
Name: Price, dtype: float64


In [527]:
df["Distance"] = df["Distance"].str.replace(r"[^\d]", "", regex=True)  # Remove non-numeric characters
df["Distance"] = pd.to_numeric(df["Distance"], errors="coerce")  # Convert to numbers
print(df["Distance"].head(10))

0     4
1     5
2    13
3     5
4     4
5     4
6     4
7     4
8    33
9     4
Name: Distance, dtype: int64


In [529]:
df["Size(sqft)"] = df["Size(sqft)"].str.replace(r"[^\d]", "", regex=True)  # Remove non-numeric characters
df["Size(sqft)"] = pd.to_numeric(df["Size(sqft)"], errors="coerce")  # Convert to numbers
print(df["Size(sqft)"].head(10))

0    1500.0
1       NaN
2       NaN
3       NaN
4    1500.0
5    1500.0
6       NaN
7       NaN
8       NaN
9    1000.0
Name: Size(sqft), dtype: float64


In [531]:
print(df.head(10))  # Inspect extracted prices

                       Title                          Price     \
0                         Single bedroom for rent      60000.0   
1            7 Unit Apartment Building for Sale!!  120000000.0   
2                  0 HWY 130 Thunder Bay, Ontario   27790000.0   
3                       Room for Rent furnished!!      85000.0   
4             Two bed one bath basement apartment     170000.0   
5                            Two bed 1 bath house     210000.0   
6                                   Room for rent      75000.0   
7      Roommate needed to share a room for female      36700.0   
8              350 Mokomon rd, Kakabeka Falls, ON   74000000.0   
9  Two bedroom apartment with utilities included.     195000.0   

                                                                                                 Description                                                                                                  \
0      Single bedroom available for rent at Robertson Street.. Ex

In [533]:
df = df.dropna()
print(df.head()) 

                        Title                         Price   \
0                          Single bedroom for rent   60000.0   
4              Two bed one bath basement apartment  170000.0   
5                             Two bed 1 bath house  210000.0   
9   Two bedroom apartment with utilities included.  195000.0   
13     Updated Two Bedroom Current River Townhouse  160000.0   

                                                                                                  Description                                                                                                  \
0       Single bedroom available for rent at Robertson Street.. Excellent renting opportunity for students. -6 minute drive to Confederation college Rent: $600/month including utilities. First and last ...   
4      Two bed 1 bath basement apartment available for rent in Northwood. Please read the advert, serious inquiries only. Close to Confederation College, and Airport. Walking distance to bus routes

In [535]:
df.to_csv("kijiji_cleaned_listings2.csv", index=False)
print("Data saved to kijiji_cleaned_listings.csv")

Data saved to kijiji_cleaned_listings.csv


In [551]:
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('kijiji_cleaned_listings.csv')
plt.figure(figsize=(8, 5))
sns.histplot(df["Price"], bins=30)

# Add title and labels
plt.yticks(np.arange(0, 22, step=3))
plt.title("Distribution of Property Prices")
plt.xlabel("Price ($)")
plt.ylabel("Count")

# Add annotations for the bars and the KDE line
plt.text(0.05, 0.9, "Bars: Frequency of Prices", transform=plt.gca().transAxes, fontsize=12, ha='left')

# Show the plot
plt.show()

In [541]:
import pandas as pd
import matplotlib.pyplot as plt


avg_price_bedrooms = df.groupby('Bedrooms')['Price'].mean()
avg_price_bathrooms = df.groupby('Bathrooms')['Price'].mean()
avg_price_parking = df.groupby('Parking')['Price'].mean()

df_combined = pd.DataFrame({
    'Bedrooms': avg_price_bedrooms,
    'Bathrooms': avg_price_bathrooms,
    'Parking': avg_price_parking
})

df_combined.plot(kind='bar', figsize=(10, 6), color=['LightSkyBlue', 'DodgerBlue', 'RoyalBlue'])

plt.title('Average Property Price by Bedrooms, Bathrooms, and Parking Availability')
plt.xlabel('Categories')
plt.ylabel('Average Property Price')
plt.xticks(rotation=0)
plt.grid(True)
plt.legend(title='Features', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [496]:
plt.figure(figsize=(8, 6))
sns.lineplot(x="Size(sqft)", y="Price", data=df, markers="o", color="blue")

# Adding title and labels
plt.title("Property Price vs. Size (sqft)")
plt.xlabel("Size (sqft)")
plt.ylabel("Price ($)")

# Adding labels to each point with adjusted placement
for i in range(df.shape[0]):
    # Adjusting the text position by offsetting it slightly
    plt.text(df["Size(sqft)"].iloc[i], df["Price"].iloc[i] + 500, f'{df["Price"].iloc[i]:,.0f}', 
             fontsize=9, color='black', ha='center', va='bottom')

plt.show()


In [494]:
plt.figure(figsize=(10, 6))

# Creating the horizontal bar plot
sns.barplot(x="Price", y="Distance", data=df, palette="Set2", orient='h')

# Adding title and labels
plt.title("Property Price vs. Distance from Lakehead University")
plt.xlabel("Price ($)")
plt.ylabel("Distance (km)")

# Display the plot
plt.show()


In [545]:
plt.figure(figsize=(8, 5))
unit_counts = df["Unit_type"].value_counts()
sns.barplot(x=unit_counts.index, y=unit_counts.values, palette="Set2")
plt.xticks(rotation=45)
plt.title("Distribution of Property Unit Types")
plt.xlabel("Unit Type")
plt.ylabel("Count")
plt.show()

In [547]:
# Calculate the count and the average price for each unit type
unit_counts = df["Unit_type"].value_counts()
avg_price = df.groupby("Unit_type")["Price"].mean()

# Create a plot with the count on the primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 5))

# Plot the count of unit types
sns.barplot(x=unit_counts.index, y=unit_counts.values, palette="Set2", ax=ax1)
ax1.set_xlabel("Unit Type")
ax1.set_ylabel("Count")
ax1.set_title("Distribution of Property Unit Types and Average Price")

# Create a secondary axis for average price
ax2 = ax1.twinx()
ax2.plot(unit_counts.index, avg_price, color='red', marker='o', linestyle='-', linewidth=2, label='Average Price')
ax2.set_ylabel("Average Price ($)", color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Show the plot
plt.xticks(rotation=45)
plt.show()
