# PART A: DATA VISUALISATION AND ANALYSIS

### 1. Check Python Environment 

In [6]:
# Check python environment 
import sys
import platform
import subprocess
import pkg_resources
import os

# Report the interpreter, operating system, relevant environment variables and
# installed packages so the notebook's results can be reproduced elsewhere.

# Basic Python information
print(f"Python version: {platform.python_version()}")
print(f"Python implementation: {platform.python_implementation()}")
print(f"Python path: {sys.executable}")
print(f"Python location: {sys.prefix}")

# Operating system information
print(f"\nOS: {platform.system()} {platform.release()}")
print(f"Architecture: {platform.machine()}")

# Environment variables
print("\nRelevant environment variables:")
for var in ['PATH', 'PYTHONPATH', 'CONDA_PREFIX', 'VIRTUAL_ENV']:
    if var in os.environ:
        print(f"{var}: {os.environ[var]}")

# Check if running in Conda environment
in_conda = os.environ.get('CONDA_PREFIX') is not None
print(f"\nRunning in Conda environment: {in_conda}")

if in_conda:
    # Get conda info
    print("\nConda information:")
    try:
        conda_info = subprocess.check_output(['conda', 'info'], text=True)
    except (OSError, subprocess.SubprocessError):
        # OSError: the conda executable is not on PATH;
        # SubprocessError: conda ran but exited with a non-zero status.
        print("Could not retrieve conda info")
    else:
        print(conda_info)

# List installed packages
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API; it is imported here because this cell's import list
# predates the switch.
from importlib import metadata as importlib_metadata

print("\nInstalled packages:")
# A set removes distributions that are discoverable through more than one path.
installed_packages = sorted({
    f"{str(dist.metadata['Name']).lower()}=={dist.version}"
    for dist in importlib_metadata.distributions()
})
shown_packages = installed_packages[:10]  # Show first 10 packages
for package in shown_packages:
    print(package)
remaining_packages = len(installed_packages) - len(shown_packages)
if remaining_packages > 0:
    # Only mention the remainder when there is one (the count used to go
    # negative when fewer than 10 packages were installed).
    print(f"... and {remaining_packages} more packages")

# Check for virtual environment
venv = os.environ.get('VIRTUAL_ENV')
if venv:
    print(f"\nVirtual environment: {venv}")

In [None]:
# Python version: 3.12.4
# Python implementation: CPython
# Python path: C:\Users\haojun\anaconda3\python.exe
# Python location: C:\Users\haojun\anaconda3

# OS: Windows 11
# Architecture: AMD64

### 2.1 Data Scraping (Selenium)

In [None]:
# IMDb: search the filming locations of famous film and television productions
# https://www.imdb.com/

In [None]:
# Search keyword: "London, England"
# URL: https://www.imdb.com/find/?s=kw&q=London%2C%20England&ref_=nv_sr_sm
# Tool: Selenium - data scraping (XPath)

In [None]:
pip install selenium

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options= Options()
chrome_options.add_argument("--disable-search-engine-choice-screen")
driver = webdriver.Chrome(options=chrome_options)


driver.get('https://google.com')
#Chrome opens with an additional alert message on top
#driver.quit()

In [None]:
print(driver.title)

In [None]:
print(driver.current_url)

In [None]:
driver.save_screenshot('screenshot.png')

In [None]:
#headless mode
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--disable-search-engine-choice-screen")

driver = webdriver.Chrome(options=options)

driver.get("https://www.nintendo.com/")

In [None]:
print(driver.current_url)

In [None]:
pip install requests pandas

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

# Configure Chrome options (the flag suppresses the search-engine-choice screen)
chrome_options = Options()
chrome_options.add_argument("--disable-search-engine-choice-screen")
driver = webdriver.Chrome(options=chrome_options)

# IMDb search result URL: titles with the keyword "london-england", sorted by
# "moviemeter" in descending order (see the query string)
url = "https://www.imdb.com/search/title/?keywords=london-england&explore=keywords&sort=moviemeter,desc"
driver.get(url)

# Module-level accumulators filled by extract_page_data(); entry i of every
# list is meant to describe the same title.
titles_list = []
years_list = []
ratings_list = []
types_list = []
links_list = []
contents_list = []

# Defining data extraction functions
def extract_page_data(driver):
    """Append every not-yet-collected result on the current page to the lists.

    Each result is read from its own container so that a title without a
    year, rating, genre or synopsis gets "N/A" for that field only. Pairing
    page-wide element lists by index shifted every later value up by one
    whenever a single field was missing.

    Results whose link is already in ``links_list`` are skipped, so calling
    this function again after more results have been loaded onto the same
    page does not duplicate rows.

    Args:
        driver: Selenium WebDriver positioned on the search-result page.
    """
    def first_text(item, selector):
        # Text of the first match inside this result, or "N/A" if absent.
        found = item.find_elements(By.CSS_SELECTOR, selector)
        return found[0].text.strip() if found else "N/A"

    seen_links = set(links_list)
    # NOTE(review): assumes each result is wrapped in a ".lister-item" element,
    # the container used by the page layout these selectors target — confirm.
    for item in driver.find_elements(By.CSS_SELECTOR, ".lister-item"):
        anchors = item.find_elements(By.CSS_SELECTOR, ".lister-item-header a")
        if not anchors:
            continue
        link = anchors[0].get_attribute("href")
        if link in seen_links:
            continue
        seen_links.add(link)

        titles_list.append(anchors[0].text)
        years_list.append(first_text(item, ".lister-item-year"))
        ratings_list.append(first_text(item, ".ratings-imdb-rating strong"))
        types_list.append(first_text(item, ".genre"))
        links_list.append(link)
        contents_list.append(first_text(item, ".ratings-bar + .text-muted"))

# Click the "50 more" button up to 3 times so the results accumulate on the
# page, then extract once. Extracting before every click re-read the rows that
# were already collected and never read the rows loaded by the final click.
# NOTE(review): assumes the button appends results to the same page — confirm.
more_button_xpath = (
    "//*[@id='__next']//button[contains("
    "translate(normalize-space(.), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
    "'abcdefghijklmnopqrstuvwxyz'), '50 more')]"
)  # contains() needs two arguments; translate() makes the match case-insensitive

try:
    for page_num in range(3):  # Suppose you click the "50 More" button 3 times
        try:
            # Wait for the button to become clickable, then click it
            more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, more_button_xpath))
            )
            more_button.click()
            sleep(2)  # Waiting for new content to load
        except Exception as e:
            print(f"Unable to click the '50 More' button: {e}")
            break

    extract_page_data(driver)
finally:
    # Close browser even if waiting, clicking or extraction fails
    driver.quit()

# Print each collected list, preceded by its label
for label, collected in (
    ("Title list:", titles_list),
    ("Year list:", years_list),
    ("Rating list:", ratings_list),
    ("Type list:", types_list),
    ("Link list:", links_list),
    ("Introduction list:", contents_list),
):
    print(label, collected)

In [4]:
#Wikipedia：https://en.wikipedia.org/wiki/Category:Films_shot_in_London

In [8]:
# Combination with other resources:
# manually or semi-automatically supplement information
# such as the title of the film and television work, appearing characters, shooting locations and plot summaries

In [10]:
# Use Python's geopy library to convert addresses to coordinates (latitude and longitude)
# Combine the Pandas library to handle Excel files
# Get the coordinates of the address and then save the result back to the Excel file

### 2.2 Data Scraping (Google Maps API)

In [23]:
# Google Maps API: https://developers.google.com/maps/documentation/places/web-service?hl=zh-cn
# API key: keep it out of the notebook (for example in an environment variable). A key that has been published must be revoked and regenerated.

In [None]:
pip install requests pandas openpyxl

In [None]:
import pandas as pd
import requests
import time

import os

# Read the Google Maps API key from the environment instead of hard-coding it:
# a key stored in the notebook is exposed to everyone who can read the file.
# Set it before starting Jupyter, e.g.  GOOGLE_MAPS_API_KEY=<your key>
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not API_KEY:
    print("Warning: GOOGLE_MAPS_API_KEY is not set; API requests will be rejected.")

def get_coordinates(address):
    """Geocode *address* with the Google Geocoding API.

    Args:
        address: Free-text address. Anything that is not a non-empty string
            (for example a NaN from an empty spreadsheet cell) is skipped
            without a request.

    Returns:
        ``(latitude, longitude)`` of the first result, or ``(None, None)``
        when the address is empty, the request fails, or the API reports a
        status other than ``OK``.
    """
    if not isinstance(address, str) or not address.strip():
        print(f"Error: EMPTY_ADDRESS for address: {address}")
        return None, None

    # Google Geocoding API endpoint. Passing the query via `params` lets
    # requests URL-encode it; interpolating the raw address into the URL broke
    # on characters such as '&' or '#'.
    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    try:
        response = requests.get(
            url, params={'address': address, 'key': API_KEY}, timeout=10
        )
    except requests.RequestException as exc:
        # Network failure or timeout: report it and keep the batch going.
        print(f"Request failed: {exc} for address: {address}")
        return None, None

    if response.status_code == 200:
        data = response.json()
        if data['status'] == 'OK':
            location = data['results'][0]['geometry']['location']
            return location['lat'], location['lng']
        else:
            print(f"Error: {data['status']} for address: {address}")
            return None, None
    else:
        print(f"HTTP Error: {response.status_code} for address: {address}")
        return None, None

# Load the spreadsheet of places
df = pd.read_excel('places.xlsx')  # Replace with your Excel file name

# Start both coordinate columns empty; they are filled row by row below
for column in ('Latitude', 'Longitude'):
    df[column] = None

# Geocode the 'Address' column one row at a time
for index, row in df.iterrows():
    address = row['Address']  # Replace with your address column name
    coordinates = get_coordinates(address)
    latitude, longitude = coordinates
    for column, value in zip(('Latitude', 'Longitude'), coordinates):
        df.at[index, column] = value
    print(f"Processed: {address} -> ({latitude}, {longitude})")
    time.sleep(1)  # Add a delay to avoid too fast requests

# Write the table, now with coordinates, to a new Excel file
df.to_excel('output_file.xlsx', index=False)  # Replace with the output file name

### 2.3 Data Scraping (Google Places API)

In [None]:
pip install selenium

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options= Options()
chrome_options.add_argument("--disable-search-engine-choice-screen")
driver = webdriver.Chrome(options=chrome_options)


driver.get('https://google.com')
#Chrome opens with an additional alert message on top
#driver.quit()

In [None]:
print(driver.title)

In [None]:
print(driver.current_url)

In [None]:
driver.save_screenshot('screenshot.png')

In [None]:
#headless mode
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--disable-search-engine-choice-screen")

driver = webdriver.Chrome(options=options)

driver.get("https://www.nintendo.com/")

In [None]:
print(driver.current_url)

In [None]:
pip install requests pandas

In [None]:
import requests
import pandas as pd
import time

import os

# Google Places API key, read from the environment instead of being
# hard-coded: a key stored in the notebook is exposed to every reader.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '')

# London's coordinates (latitude, longitude)
location = '51.5074,-0.1278'
# Search radius in meters. Nearby Search accepts at most 50,000 m; the
# previous value of 1,000,000 is outside the allowed range.
radius = 50000
keywords = ['film production location', 'anime production location', 'game production location']

# Initialize a list to store search results
locations_data = []

# Google Places API base URL
base_url = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json'

# Define search function
def search_places(keyword):
    """Run one Nearby Search for *keyword* and append the hits to ``locations_data``.

    Uses the module-level ``location``, ``radius``, ``API_KEY`` and
    ``base_url``. Every hit is stored as a dict with the keys ``Name``,
    ``Address``, ``Latitude``, ``Longitude`` and ``Type`` (the keyword).

    Network errors and API-level errors are printed and the keyword is
    skipped, so a failed request is no longer indistinguishable from
    "no results".

    NOTE(review): only the first page of results is read; the response's
    ``next_page_token`` is not followed.
    """
    params = {
        'location': location,
        'radius': radius,
        'keyword': keyword,
        'key': API_KEY
    }
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        print(f"Request failed for keyword '{keyword}': {exc}")
        return

    status = payload.get('status')
    if status not in ('OK', 'ZERO_RESULTS'):
        print(f"Places API error for keyword '{keyword}': {status} {payload.get('error_message', '')}")
        return

    # Processing the returned location data
    for place in payload.get('results', []):
        # Tolerate a result without geometry instead of raising KeyError.
        coordinates = place.get('geometry', {}).get('location', {})
        locations_data.append({
            'Name': place.get('name'),
            'Address': place.get('vicinity'),
            'Latitude': coordinates.get('lat'),
            'Longitude': coordinates.get('lng'),
            'Type': keyword
        })

# Use different keywords to search for places; each call appends its hits to
# locations_data
for keyword in keywords:
    search_places(keyword)
    time.sleep(1)  # Delay to avoid triggering API rate limits

# Save the data to a CSV file
# (the utf-8-sig encoding writes a byte-order mark at the start of the file)
df = pd.DataFrame(locations_data)
df.to_csv('london_production_locations.csv', index=False, encoding='utf-8-sig')

print("Location information has been successfully exported to 'london_production_locations.csv'")

In [None]:
import pandas as pd

# Reading CSV Files
# NOTE(review): the search cell above writes 'london_production_locations.csv';
# confirm that 'london_landmarks.csv' is the intended input here.
df = pd.read_csv('london_landmarks.csv')

# View the first rows of data
print(df.head())

In [None]:
# View basic information about the data
# (df.info() prints its report itself and returns None, so this also prints "None")
print(df.info())

# View unique landmark types
print(df['Type'].unique())

# For example, keep only the rows whose Type is 'anime landmark'
# NOTE(review): the search cell labels rows 'anime production location'; if this
# CSV came from that cell, this filter matches nothing — verify the label.
anime_landmarks = df[df['Type'] == 'anime landmark']
print(anime_landmarks)

### 3. Data Analysing

In [28]:
# Convert WGS 84 (World Geodetic System 1984, EPSG:4326) coordinates to British National Grid (EPSG:27700)

In [None]:
import pandas as pd
from pyproj import Transformer

# Set up the converter from WGS 84 (EPSG:4326) to British National Grid
# (EPSG:27700). Without always_xy=True pyproj follows each CRS's own axis
# order, so the input is (latitude, longitude) and the output is
# (easting, northing).
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700")

# Read CSV file
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\7ghdatabase.csv'  # Modify the path according to your file location
output_file = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\8-3DVisualization.csv'  # Output file path

# Assume the CSV file has two columns: Latitude and Longitude
data = pd.read_csv(file_path)

# Check if necessary columns are included
if 'Latitude' in data.columns and 'Longitude' in data.columns:
    # Batch conversion: transform both columns in a single call instead of
    # once per row. This gives the same values, is much faster, and also
    # works when the file has no rows.
    east, north = transformer.transform(
        data['Latitude'].to_numpy(), data['Longitude'].to_numpy()
    )
    data['East'] = east
    data['North'] = north

    # Save to a new CSV file
    data.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Conversion complete! Results saved to {output_file}")
else:
    print("The input CSV file must contain 'Latitude' and 'Longitude' columns!")

### 3.1 LocationMap

In [31]:
# Get the postcode of each point
# Use Python's pandas library to add type, category, and area columns
# - Typemap
# - Categorymap
# - Areamap
# Symbology
# Use Python's pandas library to add a column and count the number of times each location is used in an Excel file
# - Location_Popularitymap
# Symbology
# Count the number of points in each LSOA according to the LSOA where the point is located
# - LSOA_Popularitymap
# Symbology

In [None]:
import pandas as pd

# Read the two CSV files that are joined below
file_path1 = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\5losaborough_file.csv'  # Modify the path according to your file location
data1 = pd.read_csv(file_path1)

file_path2 = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\data set\公共交通可达性水平LSOA2011 AvPTAI2015.csv'  # Modify the path according to your file location
data2 = pd.read_csv(file_path2)

# Assume that the column name of LSOA code in both files is 'LSOA code'
# If the column name is different, you need to adjust it according to the actual situation
# how='inner' keeps only the LSOA codes that appear in both files
merged_data = pd.merge(data1, data2, on='LSOA code', how='inner')

# Export matching results to a new CSV file (relative path: current working directory)
merged_data.to_csv('matched_LSOA_data.csv', index=False)

In [None]:
pip install pandas

In [None]:
import pandas as pd

# Read Excel file ('output_file.xlsx' is the name written by the geocoding cell)
file_path = 'output_file.xlsx'  # Replace with your Excel file path
df = pd.read_excel(file_path)

# Create a new column 'Category' to store the classification results
def categorize_location(Name):
    """Classify a location name into a coarse category by keyword.

    The name is lower-cased and tested against each rule in order; the first
    rule with a keyword occurring anywhere in the name (substring match)
    wins.

    'square' is matched only by the 'Street or Square' rule. It used to be
    listed under 'Park or Outdoor' as well, which made the later entry
    unreachable, so no name was ever classified as a square.

    Args:
        Name: Location name. Non-string values (for example NaN from an empty
            cell) are classified as 'Other' instead of raising.

    Returns:
        The category label, or 'Other' when no keyword matches.
    """
    if not isinstance(Name, str):
        return 'Other'

    lowered = Name.lower()
    # Keyword classification (can be adjusted as needed); order is significant.
    rules = (
        ('Historical Building', ('tower', 'castle', 'palace', 'abbey')),
        ('Park or Outdoor', ('park', 'garden')),
        ('Museum or Gallery', ('museum', 'gallery')),
        ('Street or Square', ('street', 'road', 'avenue', 'square')),
        ('Bridge or River', ('bridge', 'river')),
        ('Station', ('station', 'terminal')),
        ('Commercial', ('mall', 'market', 'shop', 'shopping', 'center')),
        ('School', ('school', 'university', 'college', 'academy')),
        ('Government', ('city hall', 'government', 'council', 'office')),
    )
    for category, terms in rules:
        if any(term in lowered for term in terms):
            return category
    return 'Other'

# Apply classification function to every value of the 'Name' column
df['Category'] = df['Name'].apply(categorize_location)

# Save the classified data to a new Excel file
output_path = 'classified_locations.xlsx'
df.to_excel(output_path, index=False)

print(f"Classification completed, results saved to {output_path}")

In [None]:
import pandas as pd

# Read Excel file (the workbook written by the Category cell above)
file_path = 'classified_locations.xlsx'
df = pd.read_excel(file_path)

# View column names (the Area cell below needs a 'Postcode' column)
print(df.columns)

In [None]:
import pandas as pd

# Load Excel file
file_path = 'classified_locations.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Define classification function
def classify_area(postcode):
    """Map a London postcode to a broad area label.

    The postcode is split into its area letters and district number, so
    district 1 is told apart from districts 10-19. The previous prefix test
    (``startswith('E1')``, ``'W1'``, ``'SW1'``) also matched E10-E18, W10-W14
    and SW10-SW19 and labelled them 'Central London'.

    Args:
        postcode: Full postcode ('E1 6AN', 'SW1A 2AA') or outward code only
            ('E15'); case and surrounding whitespace are ignored.

    Returns:
        'Central London' for EC, WC, E1, W1 and SW1; 'East London' for other
        E districts; 'West London' for other W districts; 'North London' for
        N and NW; 'South London' for SE; otherwise 'Unknown' (including
        non-string input such as NaN and text that is not a postcode).

    NOTE(review): SW districts other than SW1 have no area label and return
    'Unknown' — confirm whether they should be 'South London'.
    """
    import re  # local import: this cell only imports pandas

    if not isinstance(postcode, str):
        return 'Unknown'

    # area letters, district digits, optional district letter (the 'A' in
    # 'SW1A'), optional inward code (digit + two letters)
    match = re.match(
        r'([A-Z]{1,2})(\d{1,2})[A-Z]?(?:\s*\d[A-Z]{2})?$',
        postcode.strip().upper(),
    )
    if match is None:
        return 'Unknown'

    area = match.group(1)
    district = int(match.group(2))

    if area in ('EC', 'WC') or (area in ('E', 'W', 'SW') and district == 1):
        return 'Central London'

    area_labels = {
        'E': 'East London',
        'W': 'West London',
        'N': 'North London',
        'NW': 'North London',  # the original 'N' prefix test also covered NW
        'SE': 'South London',
    }
    return area_labels.get(area, 'Unknown')

# Create a new column holding the area label of each row's 'Postcode'
df['Area'] = df['Postcode'].apply(classify_area)

# Save the results to an Excel file
# (same file name as the input, so the input workbook is overwritten)
output_file_path = 'classified_locations.xlsx'  # Output file path
df.to_excel(output_file_path, index=False)

print(f"Classification completed, results saved to {output_file_path}")

### 3.2 Heatmap

In [None]:
import pandas as pd

# Read Excel file
file_path = 'classified_locations.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Count the number of times each location was used
location_counts = df['Name'].value_counts()  # 'Name' is the column storing the location names

# Add the counts to the original data frame: each row gets the count of its own name
df['Usage Count'] = df['Name'].map(location_counts)

# Save the updated data frame to a new Excel file
output_file_path = 'updated_file.xlsx'  # Replace with the path of the file you want to save
df.to_excel(output_file_path, index=False)

### 3.3 Dynamic Heatmap

In [None]:
pip install folium

In [None]:
import folium

# Create a map centred on London
london_map = folium.Map(location=[51.5074, -0.1278], zoom_start=12)
london_map

In [41]:
import folium

# Create a map centred on London
london_map = folium.Map(location=[51.5074, -0.1278], zoom_start=12)

# Add one marker per row of df: popup shows "Name (Type)", tooltip shows the address
# NOTE(review): df is whatever an earlier cell left behind; it must contain the
# Latitude, Longitude, Name, Type and Address columns — confirm which table is meant
for _, row in df.iterrows():
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=f"{row['Name']} ({row['Type']})",
        tooltip=row['Address']
    ).add_to(london_map)

# Save the map as an HTML file (this cell does not display it)
london_map.save('london_landmarks_map.html')

In [None]:
# Save the filtered data as a new CSV file
# (anime_landmarks is created by the filtering cell in section 2.3)
anime_landmarks.to_csv('london_anime_landmarks.csv', index=False, encoding='utf-8-sig')
print("Data has been exported to 'london_anime_landmarks.csv'")

In [None]:
print(os.getcwd())  # Print the current working directory (os is imported in the first cell)

In [None]:
import folium

# Create a map of London
london_map = folium.Map(location=[51.5074, -0.1278], zoom_start=12)
london_map  # Display maps directly in Notebook

### 3.4 Classification Scatter Plot

In [None]:
# Classification scatter plot: use Seaborn's stripplot or swarmplot to show the distribution trend of different types of movies on landmarks
# - Which types of movies prefer certain specific landmarks?
# - Do different movie types share common landmark choices?

In [None]:
import pandas as pd

# Example of absolute path: load the CSV and preview its first rows
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\4updated_file.csv'  # Modify the path according to your file location
data = pd.read_csv(file_path)
print(data.head())

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Make sure the chart is displayed in JupyterLab
%matplotlib inline

# Read CSV file
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\4updated_file.csv' # Modify the path according to your file location
data = pd.read_csv(file_path)

# Each 'Genre' cell may hold several comma-separated genres: split them so
# every (landmark, genre) pair gets its own row, indexed by landmark name
data_expanded = data.set_index(['Name'])['Genre'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).to_frame('Genre')

# Remove extra spaces
data_expanded['Genre'] = data_expanded['Genre'].str.strip()

# Count the number of times each landmark appears in each movie genre
landmark_counts = data_expanded.groupby(['Genre', 'Name']).size().reset_index(name='count')

# Select Color Map: Adjust colors based on number of occurrences
norm = plt.Normalize(landmark_counts['count'].min(), landmark_counts['count'].max()) # Create normalization
cmap = plt.cm.viridis # Use Viridis color map

# Create a drawing
plt.figure(figsize=(40, 40))

# Create axis object
ax = plt.gca()

# Use stripplot to plot the distribution trend, increase the size of the points, and change the color according to the number of occurrences
sns.stripplot(x='Genre', y='Name', data=landmark_counts, jitter=True, size=10, 
              hue='count', palette=cmap, ax=ax)

# Set the title and label to be larger and bolder
plt.title('Distribution of Movie Genres at Landmarks in London', fontsize=30, fontweight='bold', color='white')
plt.xlabel('Movie Genre', fontsize=20, fontweight='bold', color='white')
plt.ylabel('Landmark Name', fontsize=20, fontweight='bold', color='white')

# Set the axis label font size
plt.xticks(fontsize=15, fontweight='bold', color='white')
plt.yticks(fontsize=15, fontweight='bold', color='white')

# Create a ScalarMappable object and set the color bar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])  # You must call set_array() to display the color bar.

# Specifying a colorbar using an ax object
fig = plt.gcf()
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Landmark Frequency', fontsize=15, fontweight='bold', color='white')

# Finalise the layout BEFORE saving so the saved file and the displayed figure
# use the same layout (it previously ran only after savefig), matching the
# other plotting cells
plt.tight_layout()

# save image
plt.savefig(r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\movie_genre_landmarks.png', 
            dpi=300,  # Set the resolution, 300 is high resolution
            bbox_inches='tight',  # Ensure that the chart content is displayed completely
            transparent=False)  # Keep an opaque background; use transparent=True for a see-through one

# Show chart
plt.show()

### 3.5 Time Series Plot

In [None]:
# Time Series Plot
# Use Matplotlib or Seaborn to plot a time series plot to show the frequency of landmark usage over time.
# - Which landmarks are used most frequently over time?
# - Is landmark selection related to urban cultural events at different historical stages?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read CSV file
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\4updated_file.csv' # Modify the path according to your file location
data = pd.read_csv(file_path)

# Make sure the 'Year' column is of integer type and the 'Name' column is of string type
# NOTE(review): astype(int) raises if 'Year' contains missing or non-numeric values — confirm the CSV is clean
data['Year'] = data['Year'].astype(int)
data['Name'] = data['Name'].astype(str)

# Calculate the frequency of each landmark in each year (rows per Year/Name pair)
frequency = data.groupby(['Year', 'Name']).size().reset_index(name='Frequency')

# Use Seaborn to draw a time series graph, one line per landmark
plt.figure(figsize=(40, 30))
sns.lineplot(data=frequency, x='Year', y='Frequency', hue='Name', marker='o', linewidth=3, markersize=10)

# Add title and label
plt.title('Landmark Usage Frequency Over Time', fontsize=20, fontweight='bold', color='white')
plt.xlabel('Year', fontsize=15, fontweight='bold', color='white')
plt.ylabel('Frequency', fontsize=15, fontweight='bold', color='white')

# Place the legend outside the axes, to the right of the plot area
plt.legend(title='Landmarks', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust the layout
plt.tight_layout()

# Save the chart
plt.savefig(r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\landmark_usage_frequency.png', 
dpi=300, # Set the resolution, 300 is high resolution
bbox_inches='tight', # Ensure that the chart content is fully displayed
transparent=False) # Keep an opaque background; use transparent=True for a see-through one
# Display the chart
plt.show()

In [None]:
# 1953 - Queen Elizabeth II's Coronation
#   Location: Westminster Abbey
# 1960s - Swinging London Cultural Revolution
#   Location: Carnaby Street and King's Road
# 1981 - Brixton Riots
#   Location: Brixton, Brixton High Street
# 2012 - London Olympics
#   Location: Olympic Park, Wembley Stadium, Earls Court, especially in Stratford
# 2019 - Extinction Rebellion Protests
#   Location: Parliament Square, Oxford Circus
# 2023 - Chinese New Year Celebrations, London Games Festival, Notting Hill Carnival
#   Location: Chinatown in Soho (Chinese New Year), Royal Festival Hall and Southbank Centre (London Games Festival), Notting Hill (Carnival)

### 3.6 Dynamic Heat Map over Time

In [None]:
# Dynamic heat map
# Use Folium or Geopandas to show the hottest shooting areas over time.

In [None]:
import pandas as pd
from folium.plugins import HeatMapWithTime
import folium

# Read data
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\4updated_file.csv' # Modify the path according to your file location
data = pd.read_csv(file_path)

# Make sure the data contains the required columns
if not {'Year', 'Name', 'Latitude', 'Longitude'}.issubset(data.columns):
    raise ValueError("Data needs to contain 'Year', 'Name', 'Latitude', 'Longitude' columns")

# Group by year to generate heat map data: one list of [lat, lon] pairs per year
heat_data = []
years = sorted(data['Year'].dropna().unique()) # Get all years (rows without a year cannot be placed on the timeline)
for year in years:
    year_data = data[data['Year'] == year]
    # Drop rows without coordinates so no NaN values reach the heat map layer
    points = year_data[['Latitude', 'Longitude']].dropna()
    heat_data.append(points.values.tolist())

# Create a Folium map object
m = folium.Map(location=[51.509865, -0.118092], zoom_start=11) # Based on the center of London

# Add a dynamic heat map
# The time-slider labels are passed as plain strings: `years` holds NumPy
# scalars, which may not serialise cleanly into the generated page.
# NOTE(review): if 'Year' is stored as float the labels read e.g. '2012.0'.
HeatMapWithTime(heat_data, radius=10, gradient={0.4: 'blue', 0.65: 'lime', 1: 'red'},
                index=[str(year) for year in years], auto_play=True, max_opacity=0.8).add_to(m)

# Save as HTML file
m.save(r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\dynamic_heatmap.html')
print("The dynamic heat map has been saved as dynamic_heatmap.html")

### 3.7 K-means Clustering Plot

In [None]:
# K-means clustering plot
# Group landmarks based on popularity, type, or facilities.
# - Does the distribution of landmarks show some kind of clustering pattern (such as proximity to the city center)?
# - Is there a relationship between the popularity of landmarks and accessibility to transportation?

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read data
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\4updated_file.csv' # Modify the path according to your file location
df = pd.read_csv(file_path)

# Check column names
print(df.columns)

# Standardize data so latitude, longitude and usage count carry equal weight
# NOTE(review): KMeans rejects missing values — confirm these columns have none
features = df[['Latitude', 'Longitude', 'Usage Count']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Perform K-means clustering
# n_init is set explicitly: its default differs between scikit-learn versions,
# so leaving it implicit can change the clusters after an upgrade (and older
# versions warn about it)
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_features)

# Create a graph and set the background
plt.figure(figsize=(40, 40), facecolor='black')

# Draw the clustering results
scatter = plt.scatter(df['Longitude'], df['Latitude'], c=df['Cluster'], cmap='viridis', 
                      marker='o', s=200, edgecolor='none') # Point size 200, no border

# Set the font and color
plt.title('K-means Clustering of Landmarks', fontsize=20, color='white')
plt.xlabel('Longitude', fontsize=15, color='white')
plt.ylabel('Latitude', fontsize=15, color='white')

# Set the scale axis
plt.xticks(fontsize=12, color='white')
plt.yticks(fontsize=12, color='white')

# Set the background and grid
plt.gca().set_facecolor('black') # Background black
plt.grid(True, color='white', linestyle='--', linewidth=0.5)

# Make all four axes borders white
for side in ('top', 'right', 'left', 'bottom'):
    plt.gca().spines[side].set_color('white')

# Display the color bar
cbar = plt.colorbar(scatter)
cbar.set_label('Cluster', fontsize=12, color='white')
cbar.ax.tick_params(labelsize=12, labelcolor='white')

# Adjust layout
plt.tight_layout()

# Save chart
plt.savefig(r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\landmark_clusters.png', 
            dpi=300, # Set resolution, 300 is high resolution
            bbox_inches='tight', # Ensure that the chart content is fully displayed
            transparent=False) # Keep an opaque background; use transparent=True for a see-through one

# Display graphics
plt.show()

### 3.8 Correlation Heat Map

In [None]:
# Correlation heat map
# Discover the potential connection between landmark popularity and other factors (such as transportation, location, density, etc.), and provide decision support for related fields (such as urban planning, tourism, cultural industries, etc.).
# - Relationship between landmark popularity and transportation accessibility: landmarks located near transportation hubs may be more likely to attract tourists or film and television shooting.
# - Relationship between landmark location and popularity: landmarks in central areas are more popular than those in other areas.
# - Relationship between transportation accessibility and regional area: a positive correlation, indicating that large areas may have better transportation networks, further supporting people to travel to these areas.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import chardet
file_path = r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\5losa_file.csv' # Modify the path according to your file location

# Detect file encoding from the raw bytes
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())

print(result)

# Load the file using the detected encoding; chardet reports None when it
# cannot decide, in which case fall back to UTF-8
data = pd.read_csv(file_path, encoding=result['encoding'] or 'utf-8')

# Calculate the correlation matrix
# Only numeric columns can be correlated; 'number' selects every numeric dtype
# (for example int32 and float32 as well), not just float64 and int64
numeric_columns = data.select_dtypes(include='number').columns
correlation_matrix = data[numeric_columns].corr()

# Set the background and font color of Seaborn and matplotlib
sns.set(style='dark', rc={"axes.facecolor": "black", "axes.labelcolor": "white", "xtick.color": "white", "ytick.color": "white"})
plt.rcParams['figure.facecolor'] = 'black' # Set the background of the entire figure to black
plt.rcParams['axes.facecolor'] = 'black' # Set the background of the coordinate axis to black

# Draw a heat map
plt.figure(figsize=(40, 40)) # Adjust to a suitable size
heatmap = sns.heatmap(correlation_matrix, annot=True, cmap='twilight', fmt=".2f", vmin=-1, vmax=1,
                      annot_kws={"size": 25, "fontweight":'bold'}) # Set the annotation size and weight

# Set the title and font color
plt.title("Correlation Heatmap", fontsize=40, color="white")

# Style the colorbar that the heat map created
cbar = heatmap.collections[0].colorbar
cbar.set_ticks([-1, 0, 1]) # Set the scale value of the color bar
cbar.ax.tick_params(labelsize=24, labelcolor="white") # Set the color bar font size and color

# Adjust the layout
plt.tight_layout()

# Save the chart
plt.savefig(r'E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\correlation_heatmap.png', 
            dpi=300, # Set the resolution, 300 is high resolution
            bbox_inches='tight', # Ensure that the chart content is fully displayed
            transparent=False) # Keep an opaque background; use transparent=True for a see-through one
plt.show()

# PART C: EXPERIMENTATION WITH MACHINE LEARNING

### 1. Cluster Analyses

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os

# Folder that receives every figure produced by this cell (created on first run)
output_dir = r"E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\seaborn+matpotlib\Framing Analysis"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Global Matplotlib style: transparent canvas with white text, ticks and borders,
# so the exported PNGs can be placed on a dark layout.
plt.rcParams.update({
    # Canvas
    'figure.figsize': (16.5, 11.7),  # A3 landscape ratio
    'savefig.transparent': True,
    'axes.facecolor': 'none',        # transparent axes background
    'figure.facecolor': 'none',      # transparent figure background
    # Typography
    'font.size': 14,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'normal',
    'axes.labelsize': 16,
    'legend.fontsize': 14,
    # Colours (everything white)
    'axes.titlecolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
    'axes.labelcolor': 'white',
    'text.color': 'white',
    'legend.edgecolor': 'white',
    'axes.edgecolor': 'white',
})

# Read data
file_path = r"E:\MyDocuments\UCL\RC14\TERM1\Tutorial\CINEMAP\seaborn+matpotlib\London_LSOA_Film_Analysis_Updated_Realistic.csv"
df = pd.read_csv(file_path)
# Every column except the LSOA_Code identifier is used as a feature
# (assumes all remaining columns are numeric and free of NaN - TODO confirm against the CSV)
numeric_df = df.drop(columns=["LSOA_Code"])

# Standardize data: StandardScaler centres each feature and scales it to unit variance,
# so no single feature dominates PCA / K-Means because of its units
scaler = StandardScaler()
data_scaled = scaler.fit_transform(numeric_df)
scaled_df = pd.DataFrame(data_scaled, columns=numeric_df.columns)

# -----------------------
# Multiple PCA analysis examples
# We show two different PCA projections: PC1 vs PC2, PC1 vs PC3
# Finally assume that PC1 vs PC2 is the best solution
# -----------------------
# Fit a 3-component PCA on the standardized features and project every row onto it
pca_full = PCA(n_components=3)
pca_full.fit(scaled_df)
components = pca_full.transform(scaled_df)
# One row per input row, one column per principal component; reused by all plots below
pca_3df = pd.DataFrame(components, columns=["PC1","PC2","PC3"])

# PCA scatter plots: PC1 vs PC2 (the chosen projection) and PC1 vs PC3 (for comparison).
# Each entry is (y-axis component, figure title, output file name, optional annotation).
pca_views = [
    ("PC2", "PCA (PC1 vs PC2) - Chosen Projection", "pca_projection_pc1_pc2.png", "Best PCA Projection"),
    ("PC3", "PCA (PC1 vs PC3)", "pca_projection_pc1_pc3.png", None),
]

for y_component, view_title, file_name, annotation in pca_views:
    plt.figure()
    # No `palette` here: without a `hue` variable seaborn ignores it and only emits a warning
    sns.scatterplot(x="PC1", y=y_component, data=pca_3df, alpha=0.7, edgecolor='none')
    plt.title(view_title, color='white', fontsize=20, fontweight='bold')
    plt.xlabel("PC1", fontsize=16, color='white')
    plt.ylabel(y_component, fontsize=16, color='white')
    plt.grid(True, color='white', alpha=0.3)
    if annotation is not None:
        # Mark the "best solution" near the upper-right corner of the point cloud
        plt.text(0.7 * pca_3df["PC1"].max(), 0.9 * pca_3df[y_component].max(), annotation,
                 fontsize=16, color='white', fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, file_name), transparent=True, dpi=300)
    plt.show()

# -----------------------
# Elbow Method to find the best number of clusters
# -----------------------
# Fit K-Means for each candidate k and record the inertia (within-cluster sum of squares)
inertia = []
k_range = range(2, 7)  # try k=2 to k=6
for k in k_range:
    # The loop body must be indented; unindented, the cell does not parse
    km = KMeans(n_clusters=k, random_state=42)  # fixed seed keeps the curve reproducible
    km.fit(scaled_df)
    inertia.append(km.inertia_)

plt.figure()
plt.plot(k_range, inertia, marker='o', color='white')
plt.title("Elbow Method for Optimal k", color='white', fontsize=20, fontweight='bold')
plt.xlabel("Number of Clusters (k)", fontsize=16, color='white')
plt.ylabel("Inertia", fontsize=16, color='white')
plt.grid(True, color='white', alpha=0.3)
plt.xticks(k_range)
# k=4 is taken as the elbow; label that point on the curve
plt.text(4, inertia[k_range.index(4)], "Chosen k=4", fontsize=14, color='white', fontweight='bold',
         horizontalalignment='center', verticalalignment='bottom',
         bbox=dict(facecolor='none', edgecolor='white'))
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "elbow_method_multiple_k.png"), transparent=True, dpi=300)
plt.show()

# -----------------------
# Multiple K-Means clustering display, comparison of k=3, k=4, k=5
# Final choice of k=4
# -----------------------
for k in [3, 4, 5]:
    # Cluster in the standardized feature space, then colour the PC1/PC2 projection by label
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(scaled_df)
    pca_3df["Cluster"] = clusters  # overwritten on every iteration

    plt.figure()
    sns.scatterplot(x="PC1", y="PC2", hue="Cluster", data=pca_3df, alpha=0.7,
                    palette="mako", edgecolor='none', legend='full')
    if k == 4:
        title = f"K-Means Clustering (k={k}) - Chosen Clusters"
    else:
        title = f"K-Means Clustering (k={k})"
    plt.title(title, color='white', fontsize=20, fontweight='bold')
    plt.xlabel("PC1", fontsize=16, color='white')
    plt.ylabel("PC2", fontsize=16, color='white')
    leg = plt.legend(title="Cluster", facecolor='none', edgecolor='white', labelcolor='white', title_fontsize=16)
    for text in leg.get_texts():
        text.set_color("white")
    plt.grid(True, color='white', alpha=0.3)

    # If k=4, mark it as the final choice in the figure
    if k == 4:
        plt.text(0.8 * pca_3df["PC1"].max(), 0.7 * pca_3df["PC2"].max(), "Best K=4 Chosen",
                 fontsize=16, color='white', fontweight='bold')

    # Finalise and save inside the loop so that each k gets its own image file
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"kmeans_k{k}.png"), transparent=True, dpi=300)
    plt.show()

# -----------------------
# Correlation heat map of the raw (unscaled) LSOA features
# -----------------------
corr = numeric_df.corr()
plt.figure()
heat_ax = sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="mako",
    cbar=True,
    square=True,
    linewidths=0.5,
    linecolor='white',
    annot_kws={"color": "white"},
    vmin=-1,
    vmax=1,
)
plt.title("Correlation Heatmap of LSOA Data Features", color='white', fontsize=20, fontweight='bold')

# Make the colorbar ticks and tick labels white to match the rest of the figure
cbar = heat_ax.collections[0].colorbar
cbar.ax.tick_params(axis='y', color='white')
for tick_label in cbar.ax.get_yticklabels():
    tick_label.set_color('white')

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "correlation_heatmap.png"), transparent=True, dpi=300)
plt.show()

print("Analysis completed, all images saved and displayed in Notebook.")

### 2. PIX2PIX Analysis

In [None]:
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models # add models to the list
from torchvision.utils import make_grid
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torchvision.utils import save_image

from PIL import Image

In [None]:
# Seed PyTorch's random number generator so runs are repeatable
torch.manual_seed(180)

In [None]:
# Record the installed PyTorch version alongside the results
print(torch.__version__)

In [None]:
# Global settings and hyper-parameters for the Pix2Pix model
# Use the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)
# NOTE(review): TRAIN_DIR, VAL_DIR, IMAGE_SIZE, CHANNELS_IMG and LAMBDA_GP are not referenced
# by the model / training code visible in this notebook - confirm before relying on them.
TRAIN_DIR = "data/train"
VAL_DIR = "data/val"
LEARNING_RATE = 2e-4  # learning rate passed to both Adam optimizers
BATCH_SIZE = 17 # Pick a divisor of the number of training samples (here 51 / 17 = 3 batches)
NUM_WORKERS = 2  # DataLoader worker processes
IMAGE_SIZE = 256
CHANNELS_IMG = 3
L1_LAMBDA = 100  # weight of the L1 term in the generator loss
LAMBDA_GP = 10
NUM_EPOCHS = 20  # Start with about 5; NOTE: the training cell reassigns NUM_EPOCHS before its loop
LOAD_MODEL = False # Set to True to resume from the checkpoint files below
SAVE_MODEL = True  # save checkpoints periodically during training
CHECKPOINT_DISC = "disc.pth.tar"
CHECKPOINT_GEN = "gen.pth.tar"

In [None]:
#Discriminator model for Pix2Pix

class CNNBlock(nn.Module):
    """Discriminator building block: 4x4 convolution -> BatchNorm -> LeakyReLU(0.2).

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the convolution.
        stride: Convolution stride (2 halves the spatial size, 1 shrinks it by one pixel).
    """

    def __init__(self, in_channels, out_channels, stride):
        super().__init__()
        # No conv bias: the BatchNorm that follows has its own shift parameter
        convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=4,
            stride=stride,
            padding=1,
            bias=False,
            padding_mode="reflect",
        )
        normalisation = nn.BatchNorm2d(out_channels)
        activation = nn.LeakyReLU(0.2)
        # Attribute name and layer order are kept so existing checkpoints still load
        self.conv = nn.Sequential(convolution, normalisation, activation)

    def forward(self, x):
        """Apply convolution, normalisation and activation to ``x``."""
        return self.conv(x)


class Discriminator(nn.Module):
    """Conditional patch discriminator for Pix2Pix.

    The input image ``x`` and a candidate output ``y`` are concatenated along the
    channel axis and reduced by a stack of strided convolutions to a one-channel
    map of raw scores (logits - no sigmoid is applied here).

    Args:
        in_channels: Channels of ONE image; the first convolution receives
            ``in_channels * 2`` because ``x`` and ``y`` are concatenated.
        features: Channel widths of the successive stages. The first entry is used by
            the initial convolution, the rest by ``CNNBlock`` stages. Any sequence works.
    """

    def __init__(self, in_channels=3, features=(64, 128, 256, 512)):
        super().__init__()
        # Immutable default + local copy: avoids the shared-mutable-default pitfall
        features = list(features)

        # First stage has no BatchNorm
        self.initial = nn.Sequential(
            nn.Conv2d(in_channels * 2, features[0], kernel_size=4, stride=2, padding=1, padding_mode="reflect"),
            nn.LeakyReLU(0.2),
        )

        layers = []
        in_channels = features[0]
        last_index = len(features) - 1
        for index, feature in enumerate(features[1:], start=1):
            # Only the LAST stage uses stride 1. Decide by position rather than by value,
            # so an earlier stage with the same width as the last one still downsamples.
            stride = 1 if index == last_index else 2
            layers.append(CNNBlock(in_channels, feature, stride=stride))
            in_channels = feature

        # Final projection to a single-channel score map
        layers.append(
            nn.Conv2d(in_channels, 1, kernel_size=4, stride=1, padding=1, padding_mode="reflect"),
        )

        self.model = nn.Sequential(*layers)

    def forward(self, x, y):
        """Return the score map for the pair (``x``, ``y``); both are (N, C, H, W) tensors."""
        x = torch.cat([x, y], dim=1)
        x = self.initial(x)
        return self.model(x)

In [None]:
#Generator model for Pix2Pix

class Block(nn.Module):
    """Generator building block: resample by 2 -> BatchNorm -> activation (-> dropout).

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the produced feature map.
        down: ``True`` for a stride-2 convolution (halves H and W),
            ``False`` for a stride-2 transposed convolution (doubles H and W).
        act: ``"relu"`` selects ReLU; any other value selects LeakyReLU(0.2).
        use_dropout: Apply 50% dropout to the output when ``True``.
    """

    def __init__(self, in_channels, out_channels, down=True, act="relu", use_dropout=False):
        super().__init__()
        if down:
            resample = nn.Conv2d(in_channels, out_channels, 4, 2, 1, bias=False, padding_mode="reflect")
        else:
            resample = nn.ConvTranspose2d(in_channels, out_channels, 4, 2, 1, bias=False)

        if act == "relu":
            activation = nn.ReLU()
        else:
            activation = nn.LeakyReLU(0.2)

        # Attribute names are kept so existing checkpoints still load
        self.conv = nn.Sequential(resample, nn.BatchNorm2d(out_channels), activation)
        self.use_dropout = use_dropout
        self.dropout = nn.Dropout(0.5)  # always created, only applied when use_dropout is set
        self.down = down

    def forward(self, x):
        """Run the block on ``x``; dropout is applied only if enabled at construction."""
        out = self.conv(x)
        if self.use_dropout:
            return self.dropout(out)
        return out


class Generator(nn.Module):
    """Encoder-decoder generator with skip connections for Pix2Pix.

    Eight stride-2 stages shrink the input (a 256x256 image reaches 1x1 at the
    bottleneck); eight stride-2 transposed stages grow it back. Every decoder stage
    after the first receives the previous decoder output concatenated with the
    encoder output of the same resolution. The final Tanh bounds the output to [-1, 1].

    Args:
        in_channels: Channels of the input image; the output has the same number.
        features: Base channel width; deeper stages use multiples of it (up to x8).
    """

    def __init__(self, in_channels=3, features=64):
        super().__init__()
        f = features

        # ---- Encoder -------------------------------------------------------
        self.initial_down = nn.Sequential(
            nn.Conv2d(in_channels, f, 4, 2, 1, padding_mode="reflect"),
            nn.LeakyReLU(0.2),
        )
        encoder_args = dict(down=True, act="leaky", use_dropout=False)
        self.down1 = Block(f, f * 2, **encoder_args)
        self.down2 = Block(f * 2, f * 4, **encoder_args)
        self.down3 = Block(f * 4, f * 8, **encoder_args)
        self.down4 = Block(f * 8, f * 8, **encoder_args)
        self.down5 = Block(f * 8, f * 8, **encoder_args)
        self.down6 = Block(f * 8, f * 8, **encoder_args)
        self.bottleneck = nn.Sequential(
            nn.Conv2d(f * 8, f * 8, 4, 2, 1, padding_mode="reflect"),
            nn.ReLU(),
        )

        # ---- Decoder -------------------------------------------------------
        # Inputs of up2..up7 and final_up are doubled by the concatenated skip tensor.
        # Only the three deepest decoder stages use dropout.
        decoder_args = dict(down=False, act="relu")
        self.up1 = Block(f * 8, f * 8, use_dropout=True, **decoder_args)
        self.up2 = Block(f * 16, f * 8, use_dropout=True, **decoder_args)
        self.up3 = Block(f * 16, f * 8, use_dropout=True, **decoder_args)
        self.up4 = Block(f * 16, f * 8, use_dropout=False, **decoder_args)
        self.up5 = Block(f * 16, f * 4, use_dropout=False, **decoder_args)
        self.up6 = Block(f * 8, f * 2, use_dropout=False, **decoder_args)
        self.up7 = Block(f * 4, f, use_dropout=False, **decoder_args)
        self.final_up = nn.Sequential(
            nn.ConvTranspose2d(f * 2, in_channels, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        )

    def forward(self, x):
        """Translate the image batch ``x`` (N, C, H, W) into an output of the same shape."""
        # Encoder pass, remembering every intermediate output for the skip connections
        skips = [self.initial_down(x)]
        for down in (self.down1, self.down2, self.down3, self.down4, self.down5, self.down6):
            skips.append(down(skips[-1]))

        out = self.up1(self.bottleneck(skips[-1]))

        # Decoder pass: pair up2..up7 with the encoder outputs from deepest to shallowest
        decoder = (self.up2, self.up3, self.up4, self.up5, self.up6, self.up7)
        for up, skip in zip(decoder, reversed(skips[1:])):
            out = up(torch.cat([out, skip], 1))

        return self.final_up(torch.cat([out, skips[0]], 1))

In [None]:
#Train function for Pix2Pix
# Enable cuDNN benchmark mode (lets cuDNN pick convolution algorithms; only relevant on CUDA)
torch.backends.cudnn.benchmark = True

def train_fn(disc, gen, loader, opt_disc, opt_gen, l1_loss, bce, g_scaler, d_scaler,):
    """Train discriminator and generator for one pass over ``loader``.

    For every batch the discriminator is updated first, then the generator.

    Args:
        disc: Discriminator, called as ``disc(x, y)``.
        gen: Generator, called as ``gen(x)``.
        loader: Iterable yielding ``(x, y)`` batches (input, target).
        opt_disc: Optimizer of the discriminator.
        opt_gen: Optimizer of the generator.
        l1_loss: Reconstruction loss between generated and target image.
        bce: Adversarial loss; it is applied to raw discriminator outputs.
        g_scaler: Gradient scaler used for the generator step.
        d_scaler: Gradient scaler used for the discriminator step.

    Returns:
        None. The models and optimizers are updated in place.
    """
    loop = tqdm(loader, leave=True)

    for idx, (x, y) in enumerate(loop):
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        # Train Discriminator
        with torch.cuda.amp.autocast():
            y_fake = gen(x)
            D_real = disc(x, y)
            # detach(): the discriminator loss must not back-propagate into the generator
            D_fake = disc(x, y_fake.detach())
            D_real_loss = bce(D_real, torch.ones_like(D_real))
            D_fake_loss = bce(D_fake, torch.zeros_like(D_fake))
            # Average of the real and fake terms
            D_loss = (D_real_loss + D_fake_loss) / 2

        disc.zero_grad()
        d_scaler.scale(D_loss).backward()
        d_scaler.step(opt_disc)
        d_scaler.update()

        # Train generator
        with torch.cuda.amp.autocast():
            # Re-score the fake with the just-updated discriminator, this time without detach
            D_fake = disc(x, y_fake)
            # Generator wants the discriminator to output "real" (ones) for its fakes
            G_fake_loss = bce(D_fake, torch.ones_like(D_fake))
            L1 = l1_loss(y_fake, y) * L1_LAMBDA
            G_loss = G_fake_loss + L1

        opt_gen.zero_grad()
        g_scaler.scale(G_loss).backward()
        g_scaler.step(opt_gen)
        g_scaler.update()

        # Every 10th batch, show the mean discriminator probabilities on the progress bar.
        # D_fake here is the value computed in the generator step above.
        if idx % 10 == 0:
            loop.set_postfix(
                D_real=torch.sigmoid(D_real).mean().item(),
                D_fake=torch.sigmoid(D_fake).mean().item(),
            )

In [None]:
# Utils for Pix2Pix

def save_some_examples(gen, val_loader, epoch, folder, iteration):
    """Run the generator on the first batch of ``val_loader`` and save the results.

    Written files:
      * ``<folder>/y_gen_<epoch>_<iteration>.png`` - generated image(s)
      * ``<folder>/input_<epoch>_<iteration>.png`` - generator input
      * ``<folder>/label_<epoch>_<iteration>.png`` - target, only when ``epoch == 1``
      * ``ypred_<iteration>.npy`` (current working directory) - the generated batch as a
        NumPy array of shape ``(1, batch, C, H, W)``, so it can be reloaded and recoloured

    Args:
        gen: Generator model; switched to eval mode for the forward pass and back to
            train mode afterwards.
        val_loader: Loader yielding ``(x, y)`` batches; only the first batch is used.
        epoch: Epoch number used in the file names.
        folder: Output directory for the PNG files; created if it does not exist.
        iteration: Sample index used in the file names.
    """
    # save_image does not create missing directories
    os.makedirs(folder, exist_ok=True)

    x, y = next(iter(val_loader))
    x, y = x.to(DEVICE), y.to(DEVICE)

    gen.eval()
    try:
        with torch.no_grad():
            # Rescale by 0.5 and shift by 0.5 before saving.
            # NOTE(review): this assumes tensors in [-1, 1] (the generator in this file ends
            # with Tanh); confirm that x and y use that range as well.
            y_fake = gen(x) * 0.5 + 0.5

            save_image(y_fake, os.path.join(folder, f"y_gen_{epoch}_{iteration}.png"))
            save_image(x * 0.5 + 0.5, os.path.join(folder, f"input_{epoch}_{iteration}.png"))
            if epoch == 1:
                save_image(y * 0.5 + 0.5, os.path.join(folder, f"label_{epoch}_{iteration}.png"))
    finally:
        # Always hand the generator back in training mode, even if saving failed
        gen.train()

    # Keep the leading singleton axis the downstream loading code expects
    ypred = np.array([y_fake.cpu().numpy()])
    np.save(f"ypred_{iteration}.npy", ypred)

def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    """Write the model weights and the optimizer state to ``filename``.

    The file holds a dict with the keys ``"state_dict"`` (model) and
    ``"optimizer"`` (optimizer).
    """
    print("=> Saving checkpoint")
    torch.save(
        {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()},
        filename,
    )


def load_checkpoint(checkpoint_file, model, optimizer, lr):
    """Restore ``model`` and ``optimizer`` from ``checkpoint_file``, then force ``lr``.

    The file must contain the keys ``"state_dict"`` and ``"optimizer"``. Tensors are
    mapped onto ``DEVICE`` while loading.
    """
    print("=> Loading checkpoint")
    restored = torch.load(checkpoint_file, map_location=DEVICE)
    model.load_state_dict(restored["state_dict"])
    optimizer.load_state_dict(restored["optimizer"])

    # Loading the optimizer state also restores the learning rate stored in the
    # checkpoint, so the requested rate is re-applied to every parameter group.
    for group in optimizer.param_groups:
        group["lr"] = lr

### 3. Supersampling

### 3.1 Train Data

In [None]:
Using device: cpu
2.6.0
<torch._C.Generator at 0x12fdc46f0>
Index(['id', 'Building Height', 'Car Availability', 'Green Space',
       'Population Density', 'Crime Rate', 'Road Casualties',
       'Transportation Acessibility', 'Rating', 'Sentiment', 'Subtitle',
       'Overlap'],
      dtype='object')
(250000, 4)
(4, 500, 500)
(250000, 2)
(2, 500, 500)

In [None]:
# Build the label tiles (ylabel) from the 'Subtitle' column.
# NOTE(review): df, w, h, s and step come from an earlier cell that is not part of this
# excerpt - confirm their values before changing anything here.

# Take the id and label columns as a (rows, 2) array
data=np.array(df[['id','Subtitle']])
print(data.shape)

# Transpose so that the channel axis comes first: (2, rows)
data=np.transpose(data)

# Reshape to (2,height,width); the 500 x 500 grid size is hard-coded
data= data.reshape(2,500,500)
print(data.shape)

# Start from an empty stack of tiles; the shape fixes one channel and 256 x 256 tiles
ylabel=np.empty((0,1,256,256))

# Slide an s x s window over the grid (columns outer, rows inner).
# Window origins run over range(0, w-s, step) / range(0, h-s, step), so the origin
# w-s (resp. h-s) itself is never used.
for j in range(0,w-s,step):
    for i in range(0,h-s,step):
        # data[1:] drops the id channel and keeps only the label channel
        sample = data[1:,i:i+s,j:j+s]
        # np.append copies the whole stack on every call
        ylabel=np.append(ylabel,[sample],axis=0)

# print(ylabel.shape)

# Normalize: divide the label channel by its maximum over all tiles
ylabel[:,0,:,:] = ylabel[:,0,:,:]/np.max(ylabel[:,0,:,:]) 

In [None]:
def split_image(image, tile_size=256, stride=256):
    """Cut a 2-D array into square tiles that together cover the whole array.

    Tiles are taken on a regular grid with spacing ``stride``. If the grid does not
    reach the right and/or bottom border, extra tiles aligned to that border are
    added (they overlap their neighbours), plus one tile for the lower-right corner
    when both borders need it.

    Args:
        image: 2-D array of shape ``(h, w)``.
        tile_size: Side length of every (square) tile.
        stride: Distance between the origins of neighbouring grid tiles.

    Returns:
        ``(tiles, positions)``: ``tiles`` is an array of shape
        ``(n, tile_size, tile_size)``; ``positions`` lists the ``(row, col)`` origin
        of each tile in the same order (grid tiles, right border, bottom border, corner).

    Raises:
        ValueError: If ``stride`` is not positive or the image is smaller than one tile.
    """
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    h, w = image.shape
    if h < tile_size or w < tile_size:
        # Negative slice starts would otherwise yield wrongly placed / ragged tiles
        raise ValueError(
            f"image of shape {(h, w)} is smaller than tile_size={tile_size}"
        )

    row_starts = range(0, h - tile_size + 1, stride)
    col_starts = range(0, w - tile_size + 1, stride)

    # The grid reaches a border exactly when the last origin equals (size - tile_size).
    # Testing `size % tile_size` is only right when stride == tile_size and would
    # duplicate tiles otherwise.
    needs_right = (w - tile_size) % stride != 0
    needs_bottom = (h - tile_size) % stride != 0

    # Regular grid
    positions = [(i, j) for i in row_starts for j in col_starts]
    # Right border: one border-aligned tile per grid row
    if needs_right:
        positions.extend((i, w - tile_size) for i in row_starts)
    # Bottom border: one border-aligned tile per grid column
    if needs_bottom:
        positions.extend((h - tile_size, j) for j in col_starts)
    # Lower-right corner
    if needs_right and needs_bottom:
        positions.append((h - tile_size, w - tile_size))

    tiles = [image[i:i + tile_size, j:j + tile_size] for i, j in positions]
    return np.array(tiles), positions

# Tile the 'id' column, viewed as a 500 x 500 grid, with the default tile size and stride
id_values = df['id'].values
id_image = id_values.reshape(500, 500)
image_tiles, positions = split_image(id_image)

# Sanity checks: sizes first ...
print("Original image size:", id_image.shape)
print("Number of tiles after splitting:", len(image_tiles))
print("Tile size:", image_tiles[0].shape)

# ... then value ranges, which should agree between the source and the tiles
print("Original data range:", np.min(id_values), "-", np.max(id_values))
print("Segmented data range:", np.min(image_tiles), "-", np.max(image_tiles))

In [None]:
import seaborn as sns

# Normalise the training inputs, rebuild the label image as a visual check and
# assemble the 3-channel training target.
# NOTE(review): dataclip, ylabel, h, w, s and step come from earlier cells.

# Work on a copy so the raw clips stay untouched
dataclipNew = dataclip.copy()

# Scale each of the three input channels by its own maximum over all tiles
for channel in range(3):
    dataclipNew[:, channel, :, :] = dataclipNew[:, channel, :, :] / np.max(dataclipNew[:, channel, :, :])

# Copy of the (already normalised) label tiles
ylabelNew = ylabel.copy()

# Reconstruct the whole label image from the tiles, visiting them in the order
# they were cut (columns outer, rows inner); overlapping areas are overwritten
ylabelReconstruct = np.zeros((1, h, w))
ylabelIterator = 0
for j in range(0, w-s, step):
    for i in range(0, h-s, step):
        ylabelReconstruct[0, i:i+s, j:j+s] = ylabel[ylabelIterator, 0, :, :]
        ylabelIterator += 1

# Index of the tile shown in the single-tile previews below
i = 0

# Reconstructed label image
fig = plt.figure()
plt.imshow(np.transpose(ylabelReconstruct[0, :, :]) * 255, cmap='gray')
plt.show()

# One label tile
fig = plt.figure()
plt.imshow(np.transpose(ylabel[i, 0, :, :]) * 255, cmap='gray')
plt.show()

# The same tile from the copy
fig = plt.figure()
plt.imshow(np.transpose(ylabelNew[i, 0, :, :]) * 255, cmap='gray')
plt.show()

# First input channel of the same tile. The palette name was an empty string, which
# seaborn rejects with a ValueError; "mako" matches the palette used elsewhere in this
# notebook.
fig = plt.figure()
plt.imshow(np.transpose(dataclipNew[i, 0, :, :]), cmap=sns.color_palette("mako", as_cmap=True))
plt.show()

# Check shape of X data train
print(dataclipNew.shape)

# Repeat the single label channel three times along axis 1 -> (tiles, 3, 256, 256)
rgb = np.hstack((ylabelNew, ylabelNew, ylabelNew))
print(rgb.shape)

### 3.2 Test Data

In [None]:
# Load the test grid (absolute, machine-specific path)
dft = pd.read_csv('/Users/Jun Hao/UCL/python/5_Pix2Pix/Test_2.csv')

# print(dft.columns)

# Column 0 is the cell id, columns 1-3 are the three input channels
datatest=np.array(dft[['id','Sentiment', 'Green Space', 'Transportation Acessibility']])
# print(datatest.shape)

# Channel axis first: (4, rows)
datatest=np.transpose(datatest)

# View every channel as a 300 x 300 grid
datatest= datatest.reshape(4,300,300)
# print(datatest.shape)

# Tiling parameters.
# NOTE: with these values range(0, w-s, step) is range(0, 44, 200), i.e. only the
# origin 0 - a single 256 x 256 tile is produced and the last 44 rows / columns of
# the grid are never sampled.
w=300 #width
h=300 #height
s=256 #size
step=200 #step

# Empty stacks for the input tiles (3 channels) and the matching id tiles
datacliptest=np.empty((0,3,256,256))

idvaltest=np.empty((0,256,256))

for j in range(0,w-s,step):
    for i in range(0,h-s,step):
        # Slice of size [s x s]: channels 1-3 are the inputs, channel 0 is the id
        sample = datatest[1:,i:i+s,j:j+s]
        sampleID = datatest[0,i:i+s,j:j+s]

        datacliptest=np.append(datacliptest,[sample],axis=0)
        idvaltest=np.append(idvaltest,[sampleID],axis=0)


# print(datacliptest.shape)
    
# Normalize the data: each channel is divided by its own maximum over the TEST tiles

# Normalize R
datacliptest[:,0,:,:] = datacliptest[:,0,:,:]/np.max(datacliptest[:,0,:,:]) 

# Normalize G
datacliptest[:,1,:,:] = datacliptest[:,1,:,:]/np.max(datacliptest[:,1,:,:]) 

# Normalize B
datacliptest[:,2,:,:] = datacliptest[:,2,:,:]/np.max(datacliptest[:,2,:,:]) 

### 3.3 Train the Model

In [None]:
# Build both networks, their optimizers and the two loss functions
disc = Discriminator(in_channels=3).to(DEVICE)
gen = Generator(in_channels=3, features=64).to(DEVICE)
opt_disc = optim.Adam(disc.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999),)
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
BCE = nn.BCEWithLogitsLoss()  # adversarial loss on raw discriminator outputs
L1_LOSS = nn.L1Loss()  # reconstruction loss

# Optionally resume both models from their checkpoint files
if LOAD_MODEL:
   load_checkpoint(
       CHECKPOINT_GEN, gen, opt_gen, LEARNING_RATE,
   )
   load_checkpoint(
       CHECKPOINT_DISC, disc, opt_disc, LEARNING_RATE,
   )

#Train the model
from torch.utils.data import TensorDataset

# Wrap the in-memory arrays as a dataset: inputs = normalised clips, targets = 3-channel labels
tensor_x = torch.Tensor(dataclipNew) # transform to torch tensor
tensor_y = torch.Tensor(rgb)

train_dataset = TensorDataset(tensor_x,tensor_y) # create the dataset
##

train_loader = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=NUM_WORKERS,)

# One gradient scaler per optimizer
g_scaler = torch.cuda.amp.GradScaler()
d_scaler = torch.cuda.amp.GradScaler()

# NOTE: this replaces the NUM_EPOCHS value set in the configuration cell
NUM_EPOCHS = 70

for epoch in range(NUM_EPOCHS):
    print('epoch is',epoch)
    train_fn(disc, gen, train_loader, opt_disc, opt_gen, L1_LOSS, BCE, g_scaler, d_scaler)
    
    # Checkpoint every 5th epoch (epochs 0, 5, 10, ...); the files are overwritten each time
    if SAVE_MODEL and epoch % 5 == 0:
        save_checkpoint(gen, opt_gen, filename = CHECKPOINT_GEN)
        save_checkpoint(disc, opt_disc, filename = CHECKPOINT_DISC)

# Always save the final state too (this runs regardless of SAVE_MODEL)
save_checkpoint(gen, opt_gen, filename = CHECKPOINT_GEN)
save_checkpoint(disc, opt_disc, filename = CHECKPOINT_DISC)
##

# Results: Ideally we need D_fake=0 and D_real=1

### 3.4 Test the Model

In [None]:
# Run the trained generator on every test tile, one tile per call
for i, clip in enumerate(datacliptest):
    # Add the batch axis expected by the model: (1, 3, 256, 256)
    batch = clip.reshape(1, 3, 256, 256)
    tensor_x = torch.Tensor(batch)

    # The loader needs a target, but no ground truth exists for the test tiles;
    # an uninitialised array of the right shape serves as a placeholder
    testRandomY = np.empty_like(batch)
    tensor_y = torch.Tensor(testRandomY)

    val_dataset = TensorDataset(tensor_x, tensor_y)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    # The third argument is the epoch label used in the output file names
    save_some_examples(gen, val_loader, 52, folder="evaluation", iteration=i)

### 3.5 Save the Results (csv)

In [None]:
# Collect the per-tile predictions written by the test loop
nSamples = len(datacliptest)
yFull = np.zeros((nSamples, 3, 256, 256))
for sample_idx in range(nSamples):
    yFull[sample_idx] = np.load(f"ypred_{sample_idx}.npy")


## Visualising the image
# Paste the predicted tiles (and their cell ids) back onto the 300 x 300 test grid,
# visiting the tiles in the order they were cut (columns outer, rows inner)
yReconstruct = np.zeros((3, 300, 300))
idReconstruct = np.zeros((300, 300))
ylabelIterator = 0
for j in range(0, 300-s, step):
    for i in range(0, 300-s, step):
        yReconstruct[:, i:i+s, j:j+s] = yFull[ylabelIterator, :, :, :]
        idReconstruct[i:i+s, j:j+s] = idvaltest[ylabelIterator, :, :]
        ylabelIterator += 1

# Channels-last view for display
fig = plt.figure()
plt.imshow(np.transpose(yReconstruct, (1, 2, 0)), cmap='BrBG_r')
plt.show()

# Flatten the grids: one entry per cell
yReconstruct = yReconstruct.reshape(1, 3, -1)
idReconstruct = idReconstruct.reshape(-1)

# One row per cell: its id followed by the three predicted channels
dfy = pd.DataFrame({'id': idReconstruct})
for channel in range(3):
    dfy[f'ch{channel + 1}'] = yReconstruct[0, channel, :]
dfy.to_csv(f'./_pred_LG_52.csv', index=False)