In [24]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import os
from io import StringIO
from constants import au_postcodes_df
from pydantic import BaseModel, ValidationError
from typing import Optional
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [4]:
school_types = ['primary', 'secondary']
states = list(au_postcodes_df['state'].unique())

# create an empty DataFrame to store the data
schools_with_score_df = pd.DataFrame(columns=['school', 'suburb', 'state', 'postcode', 'score', 'school_type'])

for school_type in school_types:
    for state in states:

        print(f"Start scraping  the webpage: {school_type} {state}")

        url = f'https://bettereducation.com.au/school/{school_type}/{state}/{state}_top_{school_type}_schools.aspx'
        response = requests.get(url)

        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', {'id': 'ctl00_ContentPlaceHolder1_GridView1'})
            table_io = StringIO(str(table))

            # Convert the table to a DataFrame
            df = pd.read_html(table_io)[0]
            
            # Clean the DataFrame
            if state == 'VIC':
                df = df.rename(columns={'Postcode': 'postcode'})
                df['suburb'] = df['School'].str.split(',', expand=True).iloc[:, -3:-2]
            if state == 'ACT' or state == 'TAS':
                df[['suburb', 'state', 'postcode']] = df['School'].str.split(',', expand=True).iloc[:, -3:]
            if state == 'QLD':
                df['postcode'] = ''
                df["suburb"] = df['Locality']
            if state == 'SA' or state == 'NT' or state == 'NSW':
                df[['suburb', 'state', 'postcode']] = df['Locality'].str.split(',', expand=True).iloc[:, 0:3]
            if state == 'WA':
                if school_type == 'primary':
                    df[['suburb', 'state', 'postcode']] = df['Locality'].str.split(',', expand=True).iloc[:, 0:3]
                if school_type == 'secondary':
                    df = df.rename(columns={'Postcode': 'postcode'})
                    df['suburb'] = df['School'].str.split(',', expand=True).iloc[:, -3:-2]
            
            #handle special cases
            if school_type == 'primary':
                if state == 'NSW':
                    df.loc[df['School'] == 'Redlands,Cremorne,NSW,2090', ['suburb', 'state', 'postcode']] = ['Cremorne', 'NSW', '2090']
                if state == 'NT':
                    df.loc[df['Locality'] == 'Berrimah,NT,828', 'postcode'] = '0828'
            if school_type == 'secondary':
                if state == 'NSW':
                    df.loc[df['Locality'] == 'Surry Hills NSW 2010', ['suburb', 'state', 'postcode']] = ['Surry Hill', 'NSW', '2010']
                    df.loc[df['Locality'] == 'St Ives,St Ives,NSW,2075', ['suburb', 'state', 'postcode']] = ['St Ives', 'NSW', '2075']
                    df.loc[df['School'] == 'Redlands,Cremorne,NSW,2090', ['suburb', 'state', 'postcode']] = ['Cremorne', 'NSW', '2090']


            df['School'] = df['School'].str.split(',', expand=True).iloc[:, 0]
            df["state"] = state 
            df = df[['School', 'suburb', 'state', 'postcode', 'State Overall Score']]
            df = df.rename(columns={'School': 'school', 'State Overall Score': 'score'})
            df['educationLevel'] = school_type

            # assign the dataframe name df_school_type
            schools_with_score_df = pd.concat([schools_with_score_df, df], ignore_index=True)
            
            print(f"Successfully retrieved the webpage: {school_type} {state}")
        else:
            print(f"Failed to retrieve the webpage: status code {response.status_code}")


Start scraping  the webpage: primary NT


Successfully retrieved the webpage: primary NT
Start scraping  the webpage: primary NSW
Successfully retrieved the webpage: primary NSW
Start scraping  the webpage: primary ACT
Successfully retrieved the webpage: primary ACT
Start scraping  the webpage: primary VIC
Successfully retrieved the webpage: primary VIC
Start scraping  the webpage: primary QLD
Successfully retrieved the webpage: primary QLD
Start scraping  the webpage: primary SA
Successfully retrieved the webpage: primary SA
Start scraping  the webpage: primary WA
Successfully retrieved the webpage: primary WA
Start scraping  the webpage: primary TAS
Successfully retrieved the webpage: primary TAS
Start scraping  the webpage: secondary NT
Successfully retrieved the webpage: secondary NT
Start scraping  the webpage: secondary NSW
Successfully retrieved the webpage: secondary NSW
Start scraping  the webpage: secondary ACT
Successfully retrieved the webpage: secondary ACT
Start scraping  the webpage: secondary VIC
Successfully r

In [6]:
schools_with_score_df

Unnamed: 0,school,suburb,state,postcode,score,school_type,educationLevel
0,Haileybury Rendall School,Berrimah,NT,0828,100,,primary
1,The Essington School Darwin,Nightcliff,NT,0810,100,,primary
2,Nhulunbuy Christian School,Nhulunbuy,NT,0880,99,,primary
3,Katherine School Of The Air,Katherine,NT,0850,99,,primary
4,Milkwood Steiner School,Berrimah,NT,0828,99,,primary
...,...,...,...,...,...,...,...
2247,Riverside High School,Riverside,TAS,7250,92,,secondary
2248,St Brendan-Shaw College,Devonport,TAS,7310,91,,secondary
2249,Peregrine,Nicholls Rivule,TAS,7112,91,,secondary
2250,Clarence High School,Bellerive,TAS,7018,90,,secondary


In [7]:
# default header for the request
def header(url):
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,zh-CN;q=0.7,zh;q=0.6',
        'Cache-Control': 'max-age=0',
        'Referer': url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    return headers

In [8]:
class SchoolBase(BaseModel):
    school: str
    suburb: str
    state: str
    postcode: str
    schoolType: str
    educationLevel: str
    score: Optional[int] = None

In [9]:
def extract_school_from_suburb_profile():
    directory = os.path.join("D:\\aus_real_estate_data", 'suburb-profile')
    states = os.listdir(directory)
    url = 'http://localhost:8000/schools' # backend url to post data

    schools_df = pd.DataFrame(columns=['school', 'suburb', 'state', 'postcode', 'schoolType', 'educationLevel', 'score'])

    for state_code in states:
        d2 = os.path.join(directory, state_code)
        suburbs = os.listdir(d2)
        
        for suburb in suburbs:
            d3 = os.path.join(d2, suburb)
            json_files = os.listdir(d3)

            # Add missing import for json module

            for json_f in json_files:
                with open(os.path.join(d3, json_f)) as f:
                    suburb_profile = json.load(f)

                try:
                    schools = suburb_profile['props']['pageProps']['details'].get('schoolCatchment', {}).get('schools')
                    # Add a check for the existence of the 'schools' key before accessing it
                    if schools:
                        for school in schools:
                            school_data = SchoolBase(
                                school=school['name'],
                                suburb=" ".join(suburb.split('-')[0:-1]),
                                state=state_code,
                                postcode=str(suburb.split('-')[-1]),
                                schoolType=school['type'],
                                educationLevel=school['educationLevel'],
                                score=None
                            )

                            school_data = school_data.model_dump()
                            if school_data['educationLevel'] == 'combined':
                                school_data['educationLevel'] = 'primary'
                                schools_df = pd.concat([schools_df, pd.DataFrame([school_data])], ignore_index=True)
                                school_data['educationLevel'] = 'secondary'
                                schools_df = pd.concat([schools_df, pd.DataFrame([school_data])], ignore_index=True)
                            else:
                                schools_df = pd.concat([schools_df, pd.DataFrame([school_data])], ignore_index=True)
                except KeyError as e:
                    continue
        
    return schools_df

In [10]:
schools_df = extract_school_from_suburb_profile()


In [13]:
schools_df.sort_values(by=['postcode'], inplace=True)
schools_df

Unnamed: 0,school,suburb,state,postcode,schoolType,educationLevel,score
3992,Nemarluk School,ALAWA,NT,0810,Government,primary,
4083,Nightcliff Primary School,NIGHTCLIFF,NT,0810,Government,primary,
3991,Alawa Primary School,ALAWA,NT,0810,Government,primary,
4088,The Essington School,RAPID CREEK,NT,0810,Private,primary,
4087,St Paul's Catholic Primary School,RAPID CREEK,NT,0810,Catholic,primary,
...,...,...,...,...,...,...,...
7539,Strahan Primary School,STRAHAN,TAS,7468,Government,primary,
7582,Zeehan Primary School,ZEEHAN,TAS,7469,Government,primary,
7497,Rosebery District School,ROSEBERY,TAS,7470,Government,secondary,
7495,St Joseph's Catholic School,ROSEBERY,TAS,7470,Catholic,primary,


In [15]:
# Function to apply fuzzy matching
def get_best_match(row):
    name = str(row['suburb'])
    choices = au_postcodes_df[au_postcodes_df['state'] == row['state']]['suburb']
    best_match = process.extractOne(name, choices)
    if not best_match:
        return None
    return best_match[0]  # Returns the best match name

QLD_schools_with_score_df = schools_with_score_df[schools_with_score_df['postcode']==""]
QLD_schools_with_score_df['bestmatchsuburb'] = QLD_schools_with_score_df.apply(get_best_match, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QLD_schools_with_score_df['bestmatchsuburb'] = QLD_schools_with_score_df.apply(get_best_match, axis=1)


In [17]:
QLD_schools_with_score_df

Unnamed: 0,school,suburb,state,postcode,score,school_type,educationLevel,bestmatchsuburb
717,Sunnybank Hills State School,Sunnybank Hills,QLD,,100,,primary,SUNNYBANK HILLS
718,Citipointe Christian College,Carindale,QLD,,100,,primary,CARINDALE
719,Ipswich Grammar School,Ipswich,QLD,,100,,primary,IPSWICH
720,St Peters Lutheran College,Indooroopilly,QLD,,100,,primary,INDOOROOPILLY
721,Anglican Church Grammar School,East Brisbane,QLD,,100,,primary,EAST BRISBANE
...,...,...,...,...,...,...,...,...
2070,Iona College,Lindum,QLD,,90,,secondary,WOONDUM
2071,Lutheran Ormeau Rivers District School,Pimpama,QLD,,90,,secondary,PIMPAMA
2072,Wynnum State High School,Manly,QLD,,90,,secondary,MANLY
2073,Ferny Grove State High School,Ferny Grove,QLD,,90,,secondary,FERNY GROVE


In [18]:
QLD_schools_with_score_df.drop(columns=['postcode', 'suburb'], inplace=True)
QLD_schools_with_score_df.rename(columns={'bestmatchsuburb': 'suburb'}, inplace=True)

QLD_schools_with_score_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QLD_schools_with_score_df.drop(columns=['postcode', 'suburb'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QLD_schools_with_score_df.rename(columns={'bestmatchsuburb': 'suburb'}, inplace=True)


Unnamed: 0,school,state,score,school_type,educationLevel,suburb
717,Sunnybank Hills State School,QLD,100,,primary,SUNNYBANK HILLS
718,Citipointe Christian College,QLD,100,,primary,CARINDALE
719,Ipswich Grammar School,QLD,100,,primary,IPSWICH
720,St Peters Lutheran College,QLD,100,,primary,INDOOROOPILLY
721,Anglican Church Grammar School,QLD,100,,primary,EAST BRISBANE
...,...,...,...,...,...,...
2070,Iona College,QLD,90,,secondary,WOONDUM
2071,Lutheran Ormeau Rivers District School,QLD,90,,secondary,PIMPAMA
2072,Wynnum State High School,QLD,90,,secondary,MANLY
2073,Ferny Grove State High School,QLD,90,,secondary,FERNY GROVE


In [19]:
au_postcodes_df_copy = au_postcodes_df.copy()
#extract and show duplicates on state and suburb
au_postcodes_df_copy.drop_duplicates(subset=['state', 'suburb'], keep="first", inplace=True)

In [20]:
QLD_schools_with_score_df = QLD_schools_with_score_df.merge(au_postcodes_df_copy, on=['suburb', 'state'], suffixes=('_l', '_r'), how='left')
QLD_schools_with_score_df

Unnamed: 0,school,state,score,school_type,educationLevel,suburb,postcode
0,Sunnybank Hills State School,QLD,100,,primary,SUNNYBANK HILLS,4109
1,Citipointe Christian College,QLD,100,,primary,CARINDALE,4152
2,Ipswich Grammar School,QLD,100,,primary,IPSWICH,4305
3,St Peters Lutheran College,QLD,100,,primary,INDOOROOPILLY,4068
4,Anglican Church Grammar School,QLD,100,,primary,EAST BRISBANE,4169
...,...,...,...,...,...,...,...
514,Iona College,QLD,90,,secondary,WOONDUM,4570
515,Lutheran Ormeau Rivers District School,QLD,90,,secondary,PIMPAMA,4209
516,Wynnum State High School,QLD,90,,secondary,MANLY,4179
517,Ferny Grove State High School,QLD,90,,secondary,FERNY GROVE,4055


In [21]:
schools_with_score_df = schools_with_score_df[['school', 'suburb', 'state', 'postcode', 'score', 'school_type', 'educationLevel']]
QLD_schools_with_score_df = QLD_schools_with_score_df[['school', 'suburb', 'state', 'postcode', 'score', 'school_type', 'educationLevel']]

schools_with_score_df = pd.concat([schools_with_score_df[schools_with_score_df['state']!='QLD'], QLD_schools_with_score_df], ignore_index=True)
schools_with_score_df

Unnamed: 0,school,suburb,state,postcode,score,school_type,educationLevel
0,Haileybury Rendall School,Berrimah,NT,0828,100,,primary
1,The Essington School Darwin,Nightcliff,NT,0810,100,,primary
2,Nhulunbuy Christian School,Nhulunbuy,NT,0880,99,,primary
3,Katherine School Of The Air,Katherine,NT,0850,99,,primary
4,Milkwood Steiner School,Berrimah,NT,0828,99,,primary
...,...,...,...,...,...,...,...
2247,Iona College,WOONDUM,QLD,4570,90,,secondary
2248,Lutheran Ormeau Rivers District School,PIMPAMA,QLD,4209,90,,secondary
2249,Wynnum State High School,MANLY,QLD,4179,90,,secondary
2250,Ferny Grove State High School,FERNY GROVE,QLD,4055,90,,secondary


In [22]:
# Preprocessing function for school names
def preprocess_school_names(name):
    # Implement your preprocessing logic here
    # For example, lowercasing, removing common suffixes/prefixes, etc.
    name.lower()
    name = name.replace('primary', '')
    name = name.replace('secondary', '')
    name = name.replace('school', '')
    name = name.replace('college', '')
    name = name.replace('public', '')
    name = name.replace('private', '')
    return name

# Apply preprocessing
schools_with_score_df['school_cleaned'] = schools_with_score_df['school'].apply(preprocess_school_names)
schools_df['school_cleaned'] = schools_df['school'].apply(preprocess_school_names)

# Function to apply fuzzy matching
def get_best_match(row):
    name = row['school_cleaned']
    choices = schools_df[(schools_df['postcode'] == str(row['postcode'])) & (df2['educationLevel'] == row['educationLevel'])]['school_cleaned']
    best_match = process.extractOne(name, choices, score_cutoff=80)
    if not best_match:
        return None
    return best_match[0]  # Returns the best match name

schools_with_score_df['BestMatchName'] = schools_with_score_df.apply(get_best_match, axis=1)

In [26]:
# add a column calculating similarity between school_cleaned and BestMatchName
schools_with_score_df['similarity'] = schools_with_score_df.apply(lambda row: fuzz.ratio(row['school_cleaned'], row['BestMatchName']), axis=1)
schools_with_score_df[(schools_with_score_df.duplicated(subset=['BestMatchName', 'postcode', 'educationLevel'], keep=False)) & (schools_with_score_df['state']=="NSW")].sort_values(by=['BestMatchName', 'postcode', 'educationLevel', 'similarity'], ascending=False)


Unnamed: 0,school,suburb,state,postcode,score,school_type,educationLevel,school_cleaned,BestMatchName,similarity
1193,Sydney Girls High School,Surry Hills,NSW,2010,100,,secondary,Sydney Girls High School,Sydney Girls High School,100
1192,Sydney Boys High School,Surry Hills,NSW,2010,100,,secondary,Sydney Boys High School,Sydney Girls High School,85
165,Sydney Distance Education Primary School,Surry Hills,NSW,2010,97,,primary,Sydney Distance Education Primary School,Sydney Distance Education Primary School,100
36,Sydney Grammar School,Darlinghurst,NSW,2010,100,,primary,Sydney Grammar School,Sydney Distance Education Primary School,59
1221,St George Christian School,Hurstville,NSW,2220,98,,secondary,St George Christian School,St George Christian School,100
1246,Danebank School,Hurstville,NSW,2220,97,,secondary,Danebank School,St George Christian School,49
177,North Sydney Public School,Waverton,NSW,2060,96,,primary,North Sydney Public School,North Sydney Public School,100
79,SHORE - Sydney Church of England Grammar School,North Sydney,NSW,2060,99,,primary,SHORE - Sydney Church of England Grammar School,North Sydney Public School,47
147,Italian Bilingual School,Meadowbank,NSW,2114,98,,primary,Italian Bilingual School,Italian Bilingual School,100
214,St Therese's Catholic Primary School,Denistone,NSW,2114,PNO,,primary,St Therese's Catholic Primary School,Italian Bilingual School,47


In [27]:
schools_with_score_df = schools_with_score_df.sort_values(by=['BestMatchName', 'postcode', 'educationLevel', 'similarity'], ascending=False)
schools_with_score_df.drop_duplicates(subset=['BestMatchName', 'postcode', 'educationLevel'], keep='first', inplace=True)
schools_with_score_df = schools_with_score_df[schools_with_score_df['BestMatchName'].notnull()]
schools_with_score_df = schools_with_score_df[schools_with_score_df['similarity'] >= 80]
#concate 3 columns bestmatchname, postcode, educationLevel to create a unique key
schools_with_score_df['key'] = schools_with_score_df['BestMatchName'].astype(str) + schools_with_score_df['postcode'].astype(str) + schools_with_score_df['educationLevel'].astype(str)

schools_with_score_df

Unnamed: 0,school,suburb,state,postcode,score,school_type,educationLevel,school_cleaned,BestMatchName,similarity,key
1021,Yuluma Primary School,Innaloo,WA,6018,94,,primary,Yuluma Primary School,Yuluma Primary School,100,Yuluma Primary School6018primary
1934,Yorkeys Knob State School,YORKEYS KNOB,QLD,4878,94,,primary,Yorkeys Knob State School,Yorkeys Knob State School,100,Yorkeys Knob State School4878primary
1029,Yokine Primary School,Yokine,WA,6060,94,,primary,Yokine Primary School,Yokine Primary School,100,Yokine Primary School6060primary
475,Yinnar Primary School,,VIC,3869,95,,primary,Yinnar Primary School,Yinnar Primary School,100,Yinnar Primary School3869primary
1518,Yesodei Hatorah College,Elwood,VIC,3184,92,,secondary,Yesodei Hatorah College,Yesodei HaTorah College,96,Yesodei HaTorah College3184secondary
...,...,...,...,...,...,...,...,...,...,...,...
1206,Abbotsleigh,Wahroonga,NSW,2076,99,,secondary,Abbotsleigh,Abbotsleigh,100,Abbotsleigh2076secondary
38,Abbotsleigh,Wahroonga,NSW,2076,100,,primary,Abbotsleigh,Abbotsleigh,100,Abbotsleigh2076primary
493,Abbotsford Primary School,,VIC,3067,95,,primary,Abbotsford Primary School,Abbotsford Primary School,100,Abbotsford Primary School3067primary
2107,A B Paterson College,ARUNDEL,QLD,4214,99,,secondary,A B Paterson College,A B Paterson College,100,A B Paterson College4214secondary


In [29]:
schools_df['key'] = schools_df['school_cleaned'].astype(str) + schools_df['postcode'].astype(str) + schools_df['educationLevel'].astype(str)
schools_df

Unnamed: 0,school,suburb,state,postcode,schoolType,educationLevel,score,school_cleaned,key
3992,Nemarluk School,ALAWA,NT,0810,Government,primary,,Nemarluk School,Nemarluk School0810primary
4083,Nightcliff Primary School,NIGHTCLIFF,NT,0810,Government,primary,,Nightcliff Primary School,Nightcliff Primary School0810primary
3991,Alawa Primary School,ALAWA,NT,0810,Government,primary,,Alawa Primary School,Alawa Primary School0810primary
4088,The Essington School,RAPID CREEK,NT,0810,Private,primary,,The Essington School,The Essington School0810primary
4087,St Paul's Catholic Primary School,RAPID CREEK,NT,0810,Catholic,primary,,St Paul's Catholic Primary School,St Paul's Catholic Primary School0810primary
...,...,...,...,...,...,...,...,...,...
7539,Strahan Primary School,STRAHAN,TAS,7468,Government,primary,,Strahan Primary School,Strahan Primary School7468primary
7582,Zeehan Primary School,ZEEHAN,TAS,7469,Government,primary,,Zeehan Primary School,Zeehan Primary School7469primary
7497,Rosebery District School,ROSEBERY,TAS,7470,Government,secondary,,Rosebery District School,Rosebery District School7470secondary
7495,St Joseph's Catholic School,ROSEBERY,TAS,7470,Catholic,primary,,St Joseph's Catholic School,St Joseph's Catholic School7470primary


In [49]:
final = pd.merge(schools_df, schools_with_score_df, how='left', left_on=['key'], right_on=['key'], suffixes=('_l', '_r'))
final[final['school_cleaned_r'].notnull()].drop_duplicates(subset=['school_cleaned_l', 'postcode_l', 'educationLevel_l'], keep='first')

Unnamed: 0,school_l,suburb_l,state_l,postcode_l,schoolType,educationLevel_l,score_l,school_cleaned_l,key,school_r,suburb_r,state_r,postcode_r,score_r,school_type,educationLevel_r,school_cleaned_r,BestMatchName,similarity
1,Nightcliff Primary School,NIGHTCLIFF,NT,0810,Government,primary,,Nightcliff Primary School,Nightcliff Primary School0810primary,Nightcliff Primary School,Nightcliff,NT,0810,98,,primary,Nightcliff Primary School,Nightcliff Primary School,100.0
2,Alawa Primary School,ALAWA,NT,0810,Government,primary,,Alawa Primary School,Alawa Primary School0810primary,Alawa Primary School,Alawa,NT,0810,96,,primary,Alawa Primary School,Alawa Primary School,100.0
3,The Essington School,RAPID CREEK,NT,0810,Private,primary,,The Essington School,The Essington School0810primary,The Essington School Darwin,Nightcliff,NT,0810,100,,primary,The Essington School Darwin,The Essington School,85.0
4,St Paul's Catholic Primary School,RAPID CREEK,NT,0810,Catholic,primary,,St Paul's Catholic Primary School,St Paul's Catholic Primary School0810primary,St Paul's Catholic Primary School,Nightcliff,NT,0810,97,,primary,St Paul's Catholic Primary School,St Paul's Catholic Primary School,100.0
8,Jingili Primary School,JINGILI,NT,0810,Government,primary,,Jingili Primary School,Jingili Primary School0810primary,Jingili Primary School,Jingili,NT,0810,94,,primary,Jingili Primary School,Jingili Primary School,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12114,Leighland Christian School,ULVERSTONE,TAS,7315,Private,secondary,,Leighland Christian School,Leighland Christian School7315secondary,Leighland Christian School,Ulverstone,TAS,7315,96,,secondary,Leighland Christian School,Leighland Christian School,100.0
12123,Riana Primary School,RIANA,TAS,7316,Government,primary,,Riana Primary School,Riana Primary School7316primary,Riana Primary School,Riana,TAS,7316,97,,primary,Riana Primary School,Riana Primary School,100.0
12129,Marist Regional College,BURNIE,TAS,7320,Catholic,secondary,,Marist Regional College,Marist Regional College7320secondary,Marist Regional College,Burnie,TAS,7320,93,,secondary,Marist Regional College,Marist Regional College,100.0
12130,Stella Maris Catholic School,BURNIE,TAS,7320,Catholic,primary,,Stella Maris Catholic School,Stella Maris Catholic School7320primary,Stella Maris Catholic Primary School,Burnie,TAS,7320,94,,primary,Stella Maris Catholic Primary School,Stella Maris Catholic School,88.0


In [50]:
final = final[['school_l', 'suburb_l', 'state_l', 'postcode_l', 'score_r', 'schoolType', 'educationLevel_l']]
final.rename(columns={'school_l': 'school', 'suburb_l': 'suburb', 'state_l': 'state', 'postcode_l': 'postcode', 'score_r': 'score', 'educationLevel_l': 'educationLevel'}, inplace=True)

In [51]:
final.drop_duplicates(subset=['school', 'postcode', 'educationLevel'], keep='first', inplace=True)

In [54]:
final

Unnamed: 0,school,suburb,state,postcode,score,schoolType,educationLevel
0,Nemarluk School,ALAWA,NT,0810,,Government,primary
1,Nightcliff Primary School,NIGHTCLIFF,NT,0810,98,Government,primary
2,Alawa Primary School,ALAWA,NT,0810,96,Government,primary
3,The Essington School,RAPID CREEK,NT,0810,100,Private,primary
4,St Paul's Catholic Primary School,RAPID CREEK,NT,0810,97,Catholic,primary
...,...,...,...,...,...,...,...
12161,Strahan Primary School,STRAHAN,TAS,7468,,Government,primary
12162,Zeehan Primary School,ZEEHAN,TAS,7469,,Government,primary
12163,Rosebery District School,ROSEBERY,TAS,7470,,Government,secondary
12164,St Joseph's Catholic School,ROSEBERY,TAS,7470,,Catholic,primary


In [None]:
final.to_csv('D:\\aus_real_estate_data\schools\schools.csv', index=False)