In [35]:
import warnings
import pandas as pd
from pathlib import Path

In [36]:
# Input CSV files (paths are relative to the notebook's working directory)
incidents_data_to_load, school_metrics_data_to_load = (
    Path("incidents_part1_part2.csv"),
    Path("SPREE_SY2223_School_Metric_Scores (1).csv"),
)

# Load the crime incident records and the school metric scores into pandas DataFrames
incidents_data, school_metrics_data = (
    pd.read_csv(csv_path)
    for csv_path in (incidents_data_to_load, school_metrics_data_to_load)
)


In [37]:
# Preview the first five incident records
incidents_data.head(n=5)

Unnamed: 0,the_geom,cartodb_id,the_geom_webmercator,objectid,dc_dist,psa,dispatch_date_time,dispatch_date,dispatch_time,hour,dc_key,location_block,ucr_general,text_general_code,point_x,point_y,lat,lng
0,0101000020E6100000439FB8B153CC52C0B6D18416A7F9...,3097150,0101000020110F00007AD170B738EE5FC136FD1FB6DC88...,13965196,18,3,2024-02-02 16:30:00+00,2024-02-02,11:30:00,11.0,202418000000.0,300 BLOCK S 34TH ST,600,Thefts,-75.192608,39.950412,39.950412,-75.192608
1,,3097416,,17545922,17,1,2024-06-15 12:10:00+00,2024-06-15,08:10:00,8.0,202417000000.0,2100 BLOCK SOUTH ST,600,Thefts,,,,
2,,3097550,,13065664,5,1,2024-01-06 20:35:00+00,2024-01-06,15:35:00,15.0,202405000000.0,300 BLOCK ROCHELLE AV,600,Thefts,,,,
3,,3098085,,15737696,3,3,2024-04-26 21:13:00+00,2024-04-26,17:13:00,17.0,202403000000.0,1100 BLOCK ELLSWORTH ST,1100,Fraud,,,,
4,0101000020E6100000B10E2EBA48C452C015DB8B059101...,3098523,0101000020110F0000117C9F5A8FE05FC117D42E37A291...,14865984,15,2,2024-03-15 11:17:00+00,2024-03-15,07:17:00,7.0,202415000000.0,5500 BLOCK TULIP ST,600,Thefts,-75.066939,40.012238,40.012238,-75.066939


In [38]:
# Summarize the incident data: column names, non-null counts and dtypes
incidents_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85529 entries, 0 to 85528
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   the_geom              79988 non-null  object 
 1   cartodb_id            85529 non-null  int64  
 2   the_geom_webmercator  79940 non-null  object 
 3   objectid              85529 non-null  int64  
 4   dc_dist               85529 non-null  int64  
 5   psa                   85495 non-null  object 
 6   dispatch_date_time    85529 non-null  object 
 7   dispatch_date         85529 non-null  object 
 8   dispatch_time         85529 non-null  object 
 9   hour                  85527 non-null  float64
 10  dc_key                85529 non-null  float64
 11  location_block        85513 non-null  object 
 12  ucr_general           85529 non-null  int64  
 13  text_general_code     85529 non-null  object 
 14  point_x               79940 non-null  float64
 15  point_y            

In [39]:
## 'cartodb_id' is the unique identifier for the crime incident record.
## 'dc_dist' is the code for the police district responsible for the area where the incident took place.
## 'text_general_code' provides a textual description of the incident type.
## 'ucr_general' provides a standardized crime code.
## 'location_block' gives the block-level street address where the incident took place.

In [40]:
# Number of distinct incident descriptions and of distinct UCR codes
text_general_code_count = incidents_data['text_general_code'].nunique()
ucr_general_count = incidents_data['ucr_general'].nunique()

for label, distinct_total in (
    ("Number of unique general codes for incident type:", text_general_code_count),
    ("Number of unique general ucr for incident type:", ucr_general_count),
):
    print(label, distinct_total)

Number of unique general codes for incident type: 31
Number of unique general ucr for incident type: 26


In [41]:
# Number of distinct incident descriptions and of distinct UCR codes
text_general_code_count, ucr_general_count = (
    incidents_data[column].nunique()
    for column in ('text_general_code', 'ucr_general')
)

print("Number of unique general codes for incident type:", text_general_code_count)
print("Number of unique general UCR codes for incident type:", ucr_general_count)

# For each UCR code, collect the distinct incident descriptions recorded under it
# (a Series indexed by UCR code whose values are arrays)
ucr_incidents = incidents_data.groupby('ucr_general')['text_general_code'].unique()

# Turn that Series into a two-column DataFrame with readable headers
ucr_incidents_df = (
    ucr_incidents
    .rename_axis('UCR General Code')
    .reset_index(name='Incidents')
)

ucr_incidents_df


Number of unique general codes for incident type: 31
Number of unique general UCR codes for incident type: 26


Unnamed: 0,UCR General Code,Incidents
0,100,"[Homicide - Criminal, Homicide - Justifiable]"
1,200,[Rape]
2,300,"[Robbery Firearm, Robbery No Firearm]"
3,400,"[Aggravated Assault No Firearm, Aggravated Ass..."
4,500,"[Burglary Non-Residential, Burglary Residential]"
5,600,"[Thefts, Theft from Vehicle]"
6,700,[Motor Vehicle Theft]
7,800,[Other Assaults]
8,900,[Arson]
9,1000,[Forgery and Counterfeiting]


In [42]:
# Count the distinct values of the district, service-area and identifier columns
dc_dist_count, psa_count, objectid_count, dc_key_count = (
    incidents_data[column].nunique()
    for column in ('dc_dist', 'psa', 'objectid', 'dc_key')
)

for label, distinct_total in (
    ("Number police districts responsible for the areas where the incidents took place:", dc_dist_count),
    ("Number police service areas where the incidents took place:", psa_count),
    ("Number object ids for incidents:", objectid_count),
    ("Number dc_key_counts for incidents:", dc_key_count),
):
    print(label, distinct_total)


Number police districts responsible for the areas where the incidents took place: 22
Number police service areas where the incidents took place: 6
Number object ids for incidents: 85529
Number dc_key_counts for incidents: 85520


In [43]:
# Select every row whose 'dc_key' value occurs more than once;
# keep=False marks all occurrences, not just the repeats after the first.
is_repeated_dc_key = incidents_data.duplicated(subset=['dc_key'], keep=False)
duplicates = incidents_data.loc[is_repeated_dc_key]

print("Duplicates based on 'dc_key':")
duplicates
#duplicates.count()


Duplicates based on 'dc_key':


Unnamed: 0,the_geom,cartodb_id,the_geom_webmercator,objectid,dc_dist,psa,dispatch_date_time,dispatch_date,dispatch_time,hour,dc_key,location_block,ucr_general,text_general_code,point_x,point_y,lat,lng
19063,0101000020E610000008F53E9F52C952C0B36AC658DC00...,3178665,0101000020110F00001B96BE5E1EE95FC1AC6968E0D990...,18326613,25,4,2024-07-14 03:12:00+00,2024-07-13,23:12:00,23.0,202425000000.0,3500 BLOCK N WARNOCK ST,400,Aggravated Assault Firearm,-75.145668,40.006724,40.006724,-75.145668
20439,0101000020E6100000CBBAA03A2DC352C0B7A5C464E005...,3180345,0101000020110F00008CA0BECDADDE5FC1A866C1066A96...,15265578,15,3,2024-04-08 18:39:00+00,2024-04-08,14:39:00,14.0,202415000000.0,7400 BLOCK REVERE ST,400,Aggravated Assault Firearm,-75.049636,40.04591,40.04591,-75.049636
29540,0101000020E6100000F7BD76FEE7C952C0605B3FFDE7FA...,3191206,0101000020110F00002770E7171CEA5FC18153AE41408A...,18383386,6,1,2024-04-07 04:00:00+00,2024-04-07,00:00:00,9.0,202406000000.0,900 BLOCK HAMILTON ST,100,Homicide - Criminal,-75.154785,39.960205,39.960205,-75.154785
32818,,3197371,,18383324,2,2,2024-05-21 04:00:00+00,2024-05-21,00:00:00,14.0,202402000000.0,6000 BLOCK BINGHAM ST,100,Homicide - Criminal,,,,
32819,,3197372,,18383325,2,2,2024-05-21 04:00:00+00,2024-05-21,00:00:00,14.0,202402000000.0,6000 BLOCK BINGHAM ST,100,Homicide - Criminal,,,,
33071,,3197847,,18383336,16,3,2024-04-25 04:00:00+00,2024-04-25,00:00:00,17.0,202416000000.0,2500 BLOCK A ST,100,Homicide - Criminal,,,,
33329,,3198336,,18383347,22,4,2024-02-29 05:00:00+00,2024-02-29,00:00:00,23.0,202422000000.0,3800 BLOCK MT PLEASANT AVE,100,Homicide - Criminal,,,,
33333,,3198341,,18383350,15,3,2024-04-09 04:00:00+00,2024-04-09,00:00:00,15.0,202415000000.0,7400 BLOCK REVERE ST,100,Homicide - Criminal,,,,
33349,,3198358,,18383346,22,4,2024-02-29 05:00:00+00,2024-02-29,00:00:00,23.0,202422000000.0,3800 BLOCK MT PLEASANT DR,100,Homicide - Criminal,,,,
33356,,3198365,,18383363,25,4,2024-07-14 04:00:00+00,2024-07-14,00:00:00,23.0,202425000000.0,1000 BLOCK W VENANGO ST,100,Homicide - Criminal,,,,


In [44]:
# Build the incident table: pick ten source columns and give them readable names.
# (No rows are removed in this cell.)
incident_column_labels = {
    'objectid': 'Incident ID',
    'dc_dist': 'District',
    'psa': 'Police Service Area',
    'dispatch_date_time': 'Dispatch Date and Time',
    'dispatch_date': 'Dispatch Date',
    'dispatch_time': 'Dispatch Time',
    'hour': 'Dispatch Hour',
    'location_block': 'Location Block',
    'ucr_general': 'UCR Code',
    'text_general_code': 'General Crime Category',
}

incidents_data_df = (
    incidents_data[list(incident_column_labels)]
    .rename(columns=incident_column_labels)
)

incidents_data_df

Unnamed: 0,Incident ID,District,Police Service Area,Dispatch Date and Time,Dispatch Date,Dispatch Time,Dispatch Hour,Location Block,UCR Code,General Crime Category
0,13965196,18,3,2024-02-02 16:30:00+00,2024-02-02,11:30:00,11.0,300 BLOCK S 34TH ST,600,Thefts
1,17545922,17,1,2024-06-15 12:10:00+00,2024-06-15,08:10:00,8.0,2100 BLOCK SOUTH ST,600,Thefts
2,13065664,5,1,2024-01-06 20:35:00+00,2024-01-06,15:35:00,15.0,300 BLOCK ROCHELLE AV,600,Thefts
3,15737696,3,3,2024-04-26 21:13:00+00,2024-04-26,17:13:00,17.0,1100 BLOCK ELLSWORTH ST,1100,Fraud
4,14865984,15,2,2024-03-15 11:17:00+00,2024-03-15,07:17:00,7.0,5500 BLOCK TULIP ST,600,Thefts
...,...,...,...,...,...,...,...,...,...,...
85524,16408002,24,1,2024-05-16 04:57:00+00,2024-05-16,00:57:00,0.0,3800 BLOCK CASTOR AV,2600,All Other Offenses
85525,16408889,22,4,2024-05-20 21:52:00+00,2024-05-20,17:52:00,17.0,2500 BLOCK W Girard Ave,700,Motor Vehicle Theft
85526,16408466,35,3,2024-05-13 16:10:00+00,2024-05-13,12:10:00,12.0,6400 BLOCK N 16TH ST,700,Motor Vehicle Theft
85527,16408899,18,3,2024-05-19 20:51:00+00,2024-05-19,16:51:00,16.0,S 40TH ST & SPRUCE ST,700,Motor Vehicle Theft


In [45]:
# Step 1: discard every row that has a missing value in any column
rows_without_nan = incidents_data_df.dropna(how='any')

# Step 2: discard rows in which any cell holds the placeholder 'Not Available'
has_placeholder = rows_without_nan.isin(['Not Available']).any(axis=1)
incidents_data_df = rows_without_nan.loc[~has_placeholder]

# Labels of all ten columns of the incident table.
# NOTE(review): this list is not used by any cell visible here - confirm whether
# a dtype conversion step was intended.
columns_to_convert = [
    'Incident ID', 'District', 'Police Service Area',
    'Dispatch Date and Time', 'Dispatch Date', 'Dispatch Time',
    'Dispatch Hour', 'Location Block', 'UCR Code',
    'General Crime Category',
]

# Continue with an independent copy of the cleaned table
new_incidents_data_df = incidents_data_df.copy()

new_incidents_data_df

Unnamed: 0,Incident ID,District,Police Service Area,Dispatch Date and Time,Dispatch Date,Dispatch Time,Dispatch Hour,Location Block,UCR Code,General Crime Category
0,13965196,18,3,2024-02-02 16:30:00+00,2024-02-02,11:30:00,11.0,300 BLOCK S 34TH ST,600,Thefts
1,17545922,17,1,2024-06-15 12:10:00+00,2024-06-15,08:10:00,8.0,2100 BLOCK SOUTH ST,600,Thefts
2,13065664,5,1,2024-01-06 20:35:00+00,2024-01-06,15:35:00,15.0,300 BLOCK ROCHELLE AV,600,Thefts
3,15737696,3,3,2024-04-26 21:13:00+00,2024-04-26,17:13:00,17.0,1100 BLOCK ELLSWORTH ST,1100,Fraud
4,14865984,15,2,2024-03-15 11:17:00+00,2024-03-15,07:17:00,7.0,5500 BLOCK TULIP ST,600,Thefts
...,...,...,...,...,...,...,...,...,...,...
85524,16408002,24,1,2024-05-16 04:57:00+00,2024-05-16,00:57:00,0.0,3800 BLOCK CASTOR AV,2600,All Other Offenses
85525,16408889,22,4,2024-05-20 21:52:00+00,2024-05-20,17:52:00,17.0,2500 BLOCK W Girard Ave,700,Motor Vehicle Theft
85526,16408466,35,3,2024-05-13 16:10:00+00,2024-05-13,12:10:00,12.0,6400 BLOCK N 16TH ST,700,Motor Vehicle Theft
85527,16408899,18,3,2024-05-19 20:51:00+00,2024-05-19,16:51:00,16.0,S 40TH ST & SPRUCE ST,700,Motor Vehicle Theft


In [46]:
# District / police-service-area pair of every incident, ordered by district and
# then by service area, with a fresh 0..n-1 index
district_psa_df = (
    incidents_data[['dc_dist', 'psa']]
    .rename(columns={'dc_dist': 'District', 'psa': 'Police Service Area'})
    .sort_values(by=['District', 'Police Service Area'])
    .reset_index(drop=True)
)

district_psa_df

Unnamed: 0,District,Police Service Area
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
85524,77,
85525,77,
85526,77,
85527,77,


In [47]:
# One row per district, listing the distinct police service areas seen in it.
# Missing values are filtered out of each array, so a district can end up with
# an empty array.
district_df = (
    district_psa_df
    .groupby('District')['Police Service Area']
    .unique()
    .apply(lambda areas: areas[pd.notna(areas)])
    .reset_index(name='Unique Police Service Areas')
)

district_df

# Make an output for the new dataset
#output_file_path= 'district_df.csv'

Unnamed: 0,District,Unique Police Service Areas
0,1,"[1, 2]"
1,2,"[1, 2, 3]"
2,3,"[1, 2, 3]"
3,5,"[1, 2, 3]"
4,6,"[1, 2, 3]"
5,7,"[1, 2, 3]"
6,8,"[1, 2, 3]"
7,9,"[1, 2, 3, 4, 5]"
8,12,"[1, 2, 3, 4]"
9,14,"[1, 2, 3, 4]"


In [48]:
# UCR code and crime category of every incident, ordered by 'UCR Code' only
# (rows sharing a code are not ordered by category), with a fresh 0..n-1 index
incident_code_df = (
    incidents_data[['ucr_general', 'text_general_code']]
    .rename(columns={
        'ucr_general': 'UCR Code',
        'text_general_code': 'General Crime Category',
    })
    .sort_values(by=['UCR Code'])
    .reset_index(drop=True)
)

incident_code_df



Unnamed: 0,UCR Code,General Crime Category
0,100,Homicide - Criminal
1,100,Homicide - Criminal
2,100,Homicide - Criminal
3,100,Homicide - Criminal
4,100,Homicide - Criminal
...,...,...
85524,2600,All Other Offenses
85525,2600,All Other Offenses
85526,2600,All Other Offenses
85527,2600,All Other Offenses


In [60]:
# Crime groups: each key is a group name, each value lists the
# 'General Crime Category' labels that belong to that group.
group_mapping = {
    'Violent Crimes': ['Homicide - Criminal', 'Homicide - Justifiable', 'Rape', 'Robbery No Firearm', 'Robbery Firearm', 'Aggravated Assault Firearm', 'Aggravated Assault No Firearm', 'Other Assaults'],
    'Property Crimes': ['Burglary Residential', 'Burglary Non-Residential', 'Thefts', 'Theft from Vehicle', 'Motor Vehicle Theft', 'Arson', 'Forgery and Counterfeiting', 'Receiving Stolen Property', 'Vandalism/Criminal Mischief'],
    'Public Order Crimes': ['Disorderly Conduct', 'Vagrancy/Loitering', 'Public Drunkenness', 'Liquor Law Violations', 'DRIVING UNDER THE INFLUENCE'],
    'Drug and Vice Crimes': ['Prostitution and Commercialized Vice', 'Narcotic / Drug Law Violations', 'Other Sex Offenses (Not Commercialized)', 'Gambling Violations'],
    'Other Crimes': ['Offenses Against Family and Children', 'Fraud', 'Embezzlement', 'Weapon Violations', 'All Other Offenses']
}

# Reverse the mapping to map each crime category to its group
crime_to_group = {crime: group for group, crimes in group_mapping.items() for crime in crimes}

# Series.map leaves NaN for any category missing from the mapping, so report
# such categories instead of letting them pass unnoticed.
observed_categories = new_incidents_data_df['General Crime Category'].dropna().unique()
unmapped_categories = [
    category for category in observed_categories if category not in crime_to_group
]
if unmapped_categories:
    warnings.warn(
        f"No crime group defined for categories: {unmapped_categories}; "
        "their 'Crime Group' will be NaN"
    )

# Create a new column for the crime group
new_incidents_data_df['Crime Group'] = new_incidents_data_df['General Crime Category'].map(crime_to_group)

new_incidents_data_df

Unnamed: 0,Incident ID,District,Police Service Area,Dispatch Date and Time,Dispatch Date,Dispatch Time,Dispatch Hour,Location Block,UCR Code,General Crime Category,Crime Group
0,13965196,18,3,2024-02-02 16:30:00+00,2024-02-02,11:30:00,11.0,300 BLOCK S 34TH ST,600,Thefts,Property Crimes
1,17545922,17,1,2024-06-15 12:10:00+00,2024-06-15,08:10:00,8.0,2100 BLOCK SOUTH ST,600,Thefts,Property Crimes
2,13065664,5,1,2024-01-06 20:35:00+00,2024-01-06,15:35:00,15.0,300 BLOCK ROCHELLE AV,600,Thefts,Property Crimes
3,15737696,3,3,2024-04-26 21:13:00+00,2024-04-26,17:13:00,17.0,1100 BLOCK ELLSWORTH ST,1100,Fraud,Other Crimes
4,14865984,15,2,2024-03-15 11:17:00+00,2024-03-15,07:17:00,7.0,5500 BLOCK TULIP ST,600,Thefts,Property Crimes
...,...,...,...,...,...,...,...,...,...,...,...
85524,16408002,24,1,2024-05-16 04:57:00+00,2024-05-16,00:57:00,0.0,3800 BLOCK CASTOR AV,2600,All Other Offenses,Other Crimes
85525,16408889,22,4,2024-05-20 21:52:00+00,2024-05-20,17:52:00,17.0,2500 BLOCK W Girard Ave,700,Motor Vehicle Theft,Property Crimes
85526,16408466,35,3,2024-05-13 16:10:00+00,2024-05-13,12:10:00,12.0,6400 BLOCK N 16TH ST,700,Motor Vehicle Theft,Property Crimes
85527,16408899,18,3,2024-05-19 20:51:00+00,2024-05-19,16:51:00,16.0,S 40TH ST & SPRUCE ST,700,Motor Vehicle Theft,Property Crimes


In [62]:
# Destination file for the incident table
output_file_path = 'new_incidents_data_df.csv'

# Write the table as CSV, leaving out the index column
new_incidents_data_df.to_csv(path_or_buf=output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")

DataFrame saved to new_incidents_data_df.csv


In [50]:
def _distinct_values(column):
    """Return the distinct values found in one group's column."""
    return column.unique()


# One row per UCR code, holding the distinct crime categories and the distinct
# crime groups recorded under that code; reset_index turns 'UCR Code' back into
# a regular column.
incident_df = (
    new_incidents_data_df
    .groupby('UCR Code')
    .agg({
        'General Crime Category': _distinct_values,
        'Crime Group': _distinct_values,
    })
    .reset_index()
)

# Write the summary as CSV, leaving out the index column
output_file_path = 'incident_df.csv'
incident_df.to_csv(path_or_buf=output_file_path, index=False)

incident_df

Unnamed: 0,UCR Code,General Crime Category,Crime Group
0,100,"[Homicide - Criminal, Homicide - Justifiable]",[Violent Crimes]
1,200,[Rape],[Violent Crimes]
2,300,"[Robbery Firearm, Robbery No Firearm]",[Violent Crimes]
3,400,"[Aggravated Assault No Firearm, Aggravated Ass...",[Violent Crimes]
4,500,"[Burglary Non-Residential, Burglary Residential]",[Property Crimes]
5,600,"[Thefts, Theft from Vehicle]",[Property Crimes]
6,700,[Motor Vehicle Theft],[Property Crimes]
7,800,[Other Assaults],[Violent Crimes]
8,900,[Arson],[Property Crimes]
9,1000,[Forgery and Counterfeiting],[Property Crimes]


In [51]:
import geopandas as gpd

# Load the school records from GeoJSON into a GeoDataFrame
geo_schools_df = gpd.read_file('Schools.geojson')

# Attribute-only variant of the same table, without the geometry column
schools_df = geo_schools_df.drop(columns=['geometry'])

# Display the GeoDataFrame
geo_schools_df


Unnamed: 0,OBJECTID,AUN,SCHOOL_NUM,LOCATION_ID,SCHOOL_NAME,SCHOOL_NAME_LABEL,STREET_ADDRESS,ZIP_CODE,PHONE_NUMBER,ACTIVE,GRADE_LEVEL,GRADE_ORG,ENROLLMENT,TYPE,TYPE_SPECIFIC,geometry
0,1,226519902.0,,,WEST CATHOLIC PREPARATORY HIGH SCHOOL,WEST PHILADELPHIA CATHOLIC HIGH SCHOOL,4501-17 CHESTNUT ST,19139,(215) 386-2244,O,HIGH SCHOOL,9-12,,3,ARCHDIOCESE,POINT (-75.21170 39.95725)
1,2,226519382.0,,,ST THOMAS AQUINAS SCHOOL,ST THOMAS AQUINAS SCHOOL,1616 S 17TH ST,19145,(215) 334-0878,O,ELEMENTARY/MIDDLE,PREK-8,,3,PRIVATE,POINT (-75.17469 39.93040)
2,3,226519222.0,,,ST RAYMOND OF PENAFORT SCHOOL,ST RAYMOND SCHOOL,1330 VERNON RD,19150,(215) 548-1919,O,ELEMENTARY/MIDDLE,PREK-8,,3,PRIVATE,POINT (-75.16712 40.07199)
3,4,226519422.0,,,BLESSED TRINITY REGIONAL CATHOLIC SCHOOL,BLESSED TRINITY REGIONAL CATHOLIC SCHOOL,3033 LEVICK ST,19149,(215) 338-9797,O,ELEMENTARY/MIDDLE,PREK-8,,3,ARCHDIOCESE,POINT (-75.06043 40.03079)
4,5,226519142.0,,,ST PETER THE APOSTLE SCHOOL,ST PETER THE APOSTLE SCHOOL,1009 N 5TH ST,19123,(215) 922-5958,O,ELEMENTARY/MIDDLE,PREK-8,,3,ARCHDIOCESE,POINT (-75.14488 39.96959)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,487,,,8890,ONE BRIGHT RAY-SIMPSON CAMPUS,ONE BRIGHT RAY - SIMPSON EVENING,1142 E ERIE AVE,19124,(215) 744-6000,,HIGH SCHOOL,9-12,131.0,1,CONTRACTED,POINT (-75.10560 40.00528)
487,488,133513315.0,8149.0,3406,JOHN B STETSON MIDDLE SCHOOL,JOHN B STETSON MIDDLE SCHOOL CONTINUATION ACADEMY,3200 B ST,19134,(215) 400-9150,,,5-8,19.0,1,CONTRACTED,POINT (-75.12564 39.99873)
488,489,,,,,WISSAHICKON CHARTER SCHOOL,815 E WASHINGTON LN,19138,(267) 774-4370,,ELEMENTARY/MIDDLE,K-8,,2,CHARTER,POINT (-75.17322 40.05205)
489,490,,,,EXCEL ACADEMY CENTRAL,EXCEL ACADEMY CENTRAL,201 E OLNEY AVE,19120,,,HIGH SCHOOL,9-12,177.0,1,CONTRACTED,POINT (-75.11804 40.03526)


In [52]:
# Summarize the schools GeoDataFrame: column names, non-null counts and dtypes
geo_schools_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   OBJECTID           491 non-null    int64   
 1   AUN                451 non-null    float64 
 2   SCHOOL_NUM         286 non-null    float64 
 3   LOCATION_ID        328 non-null    object  
 4   SCHOOL_NAME        483 non-null    object  
 5   SCHOOL_NAME_LABEL  491 non-null    object  
 6   STREET_ADDRESS     491 non-null    object  
 7   ZIP_CODE           491 non-null    object  
 8   PHONE_NUMBER       490 non-null    object  
 9   ACTIVE             131 non-null    object  
 10  GRADE_LEVEL        490 non-null    object  
 11  GRADE_ORG          491 non-null    object  
 12  ENROLLMENT         329 non-null    float64 
 13  TYPE               491 non-null    int64   
 14  TYPE_SPECIFIC      491 non-null    object  
 15  geometry           491 non-null    geometry
dtype

In [53]:


# Rename columns for clarity
geo_schools_df = geo_schools_df.rename(columns={
    'LOCATION_ID': 'school_id'
})

# Columns to keep, including the geometry column
columns_schools_with_geometry = [
    'OBJECTID',
    'school_id',
    'SCHOOL_NAME',
    'SCHOOL_NAME_LABEL',
    'STREET_ADDRESS',
    'ZIP_CODE',
    'PHONE_NUMBER',
    'GRADE_LEVEL',
    'GRADE_ORG',
    'ENROLLMENT',
    'TYPE',
    'TYPE_SPECIFIC',
    'geometry'  # Include the geometry column
]

# Keep only the rows that have a 'school_id'
new_geo_schools_df = geo_schools_df[geo_schools_df['school_id'].notna()]

# Select only the desired columns, including the geometry column
new_geo_schools_df = new_geo_schools_df[columns_schools_with_geometry]

# Reset the index so it runs 0..n-1 after the filtering
new_geo_schools_df = new_geo_schools_df.reset_index(drop=True)

# Make an output for the new dataset
output_file_path = 'new_geo_schools_df.csv'

# Save the DataFrame to a CSV file
new_geo_schools_df.to_csv(output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")

# Check the result. A bare expression is only rendered when it is the last
# statement of the cell, so it is placed at the end.
new_geo_schools_df

DataFrame saved to new_geo_schools_df.csv


In [54]:
# Read the GeoJSON file into a GeoDataFrame
geo_police_df = gpd.read_file('Police_Stations.geojson')

# List of columns to drop
columns_to_drop = ['geometry', 'RULEID']

# Drop the specified columns
police_loc_df = geo_police_df.drop(columns=columns_to_drop)

# Make an output for the new dataset
output_file_path = 'police_loc_df.csv'

# Save the DataFrame to a CSV file
police_loc_df.to_csv(output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")

# Show the table. A bare expression is only rendered when it is the last
# statement of the cell, so it is placed at the end.
police_loc_df


DataFrame saved to police_loc_df.csv


In [55]:
# Summarize the police stations GeoDataFrame: column names, non-null counts and dtypes
geo_police_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   OBJECTID          24 non-null     int64   
 1   DISTRICT_NUMBER   24 non-null     int64   
 2   LOCATION          24 non-null     object  
 3   TELEPHONE_NUMBER  24 non-null     object  
 4   RULEID            24 non-null     object  
 5   geometry          24 non-null     geometry
dtypes: geometry(1), int64(2), object(3)
memory usage: 1.3+ KB


In [56]:
# Define `police_df` explicitly so the notebook runs on a fresh kernel; no
# earlier cell creates it. Only 'RULEID' is dropped, so the station attributes
# and the geometry column are kept.
police_df = geo_police_df.drop(columns=['RULEID'])

police_df

Unnamed: 0,OBJECTID,DISTRICT_NUMBER,LOCATION,TELEPHONE_NUMBER,geometry
0,1,7,Bustleton Ave & Bowler St,686-3070,POINT (-75.03270 40.09098)
1,2,14,Haines St & Germantown Ave,686-3140,POINT (-75.17700 40.03843)
2,3,8,Academy Rd & Red Lion Rd,686-3080,POINT (-74.99730 40.08011)
3,4,2,Harbison Ave & Levick St,686-3020,POINT (-75.06325 40.03149)
4,5,35,N Broad St & Champlost St,686-3350,POINT (-75.14365 40.04450)
5,6,15,Harbison Ave & Levick St,686-3150,POINT (-75.06325 40.03149)
6,7,5,Ridge Ave & Cinnaminson St,686-3050,POINT (-75.22440 40.04020)
7,8,39,22nd St & Hunting Park Ave,686-3390,POINT (-75.16425 40.01090)
8,9,25,3901 Whitaker Ave,686-3250,POINT (-75.12249 40.00874)
9,10,22,17th St & Montgomery Ave,686-3220,POINT (-75.16240 39.98079)


In [57]:

# ZIP codes for the police stations. They are matched to `police_df` purely by
# row position, so the list order must follow the row order of the table.
zip_codes = [
    19115, 19144, 19114, 19149, 19141, 19149,
    19128, 19140, 19124, 19121, 19124, 19125,
    19151, 19104, 19130, 19107, 19143, 19146,
    19147, 19142, 19145, 19130, 19106, 19104,
]

# Add the 'Zip Code' column, using only as many entries as the table has rows
station_count = len(police_df)
police_df['Zip Code'] = zip_codes[:station_count]

police_df

Unnamed: 0,OBJECTID,DISTRICT_NUMBER,LOCATION,TELEPHONE_NUMBER,geometry,Zip Code
0,1,7,Bustleton Ave & Bowler St,686-3070,POINT (-75.03270 40.09098),19115
1,2,14,Haines St & Germantown Ave,686-3140,POINT (-75.17700 40.03843),19144
2,3,8,Academy Rd & Red Lion Rd,686-3080,POINT (-74.99730 40.08011),19114
3,4,2,Harbison Ave & Levick St,686-3020,POINT (-75.06325 40.03149),19149
4,5,35,N Broad St & Champlost St,686-3350,POINT (-75.14365 40.04450),19141
5,6,15,Harbison Ave & Levick St,686-3150,POINT (-75.06325 40.03149),19149
6,7,5,Ridge Ave & Cinnaminson St,686-3050,POINT (-75.22440 40.04020),19128
7,8,39,22nd St & Hunting Park Ave,686-3390,POINT (-75.16425 40.01090),19140
8,9,25,3901 Whitaker Ave,686-3250,POINT (-75.12249 40.00874),19124
9,10,22,17th St & Montgomery Ave,686-3220,POINT (-75.16240 39.98079),19121
