### Load the data

In [None]:
import plotly.express as px
import math
import pandas as pd


In [59]:
url = 'https://github.com/Lambda-School-Labs/bridges-to-prosperity-ds-d/blob/main/Data/B2P%20Dataset_2020.10.xlsx?raw=true'
df = pd.read_excel(url, sheet_name='Data')

### Define the target

In [None]:
# Any sites with a "Yes" in the column "Senior Engineering Review Conducted"
# have undergone a full technical review, and of those, the 
# "Bridge Opportunity: Stage" column can be considered to be correct.
positive = (
    (df['Senior Engineering Review Conducted']=='Yes') & 
    (df['Bridge Opportunity: Stage'].isin(['Complete', 'Prospecting', 'Confirmed']))
)

negative = (
    (df['Senior Engineering Review Conducted']=='Yes') & 
    (df['Bridge Opportunity: Stage'].isin(['Rejected', 'Cancelled']))
)

# Any sites without a "Yes" in column Senior Engineering Review Conducted" 
# have not undergone a full technical review ...
# So these sites are unknown and unlabeled
unknown = df['Senior Engineering Review Conducted'].isna()

# Create a new column named "Good Site." This is the target to predict.
# Assign a 1 for the positive class and 0 for the negative class.
df.loc[positive, 'Good Site'] = 1
df.loc[negative, 'Good Site'] = 0

# Assign -1 for unknown/unlabled observations. 
# Scikit-learn's documentation for "Semi-Supervised Learning" says, 
# "It is important to assign an identifier to unlabeled points ...
# The identifier that this implementation uses is the integer value -1."
# We'll explain this soon!
df.loc[unknown, 'Good Site'] = -1

df['Good Site'].value_counts(dropna=False)

-1.0    1383
 1.0      65
 0.0      24
Name: Good Site, dtype: int64

### Drop columns used to derive the target

### Look at target's distribution

In [None]:
df['Good Site'].value_counts()

-1.0    1383
 1.0      65
 0.0      24
Name: Good Site, dtype: int64

### Wrangle the data

In [None]:
import numpy as np

def wrangle(X):

  X['Bridge classification'] = X['Bridge classification'].replace({np.nan: "unknown"})

  X['Height differential between banks'] = X['Height differential between banks'].replace({np.nan: 10})

  X['Cell service quality'] = X['Cell service quality'].replace({np.nan: "unknown"})

  X['4WD Accessibility'] = X['4WD Accessibility'].replace({np.nan: "unknown"})

  X['Bridge Type'] = X['Bridge Type'].replace({np.nan: "unknown"})
  
  X['Estimated span (m)'] = X['Estimated span (m)'].replace({np.nan: X['Estimated span (m)'].median()})

  X['Days per year river is flooded'] = X['Days per year river is flooded'].replace({np.nan: X['Days per year river is flooded'].median()})

  X['River crossing deaths in last 3 years'] = X['River crossing deaths in last 3 years'].replace({np.nan: X['River crossing deaths in last 3 years'].median()})

  X['River crossing injuries in last 3 years'] = X['River crossing injuries in last 3 years'].replace({np.nan: X['River crossing injuries in last 3 years'].median()})
  
  X['Proposed Bridge Location (GPS) (Latitude)'] = X['Proposed Bridge Location (GPS) (Latitude)'].replace({np.nan: X['Proposed Bridge Location (GPS) (Latitude)'].median()})

  X['Proposed Bridge Location (GPS) (Longitude)'] = X['Proposed Bridge Location (GPS) (Longitude)'].replace({np.nan: X['Proposed Bridge Location (GPS) (Longitude)'].median()})


  crossing = []
  for i in X['Current crossing method']:
    if type(i) == float:
      crossing.append("unknown")
    elif 'timber' in i.lower() or 'log' in i.lower():
      crossing.append('timber')
    elif 'boat' in i.lower():
      crossing.append('boat')
    else:
      crossing.append(i.lower())
  X['crossing'] = crossing
  
  return X

### Make a semi-supervized model and use it generate labels for unknown data points

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.semi_supervised import LabelSpreading


target = 'Good Site'

features = ['Bridge classification', 'crossing', 'Days per year river is flooded',
            'River crossing deaths in last 3 years', 'River crossing injuries in last 3 years',
            'Cell service quality', '4WD Accessibility', 'Bridge Type',
            'Proposed Bridge Location (GPS) (Latitude)', 'Proposed Bridge Location (GPS) (Longitude)',
            'Height differential between banks', 'Estimated span (m)']


labels = df[target]
X = wrangle(df)
X = X[features]
enc = OrdinalEncoder()


enc.fit(X)

X = enc.transform(X)

label_spread = LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(X, labels)

output_labels = label_spread.transduction_

df['knn'] = output_labels
df['knn'].value_counts(normalize=True)



max_iter=30 was reached without convergence.


invalid value encountered in true_divide



1.0    0.650136
0.0    0.349864
Name: knn, dtype: float64

#Build visualization for KNN model 

###create a variable that contains the probabilities as arrays

In [None]:
#create variable containing probabilities 
double_array = label_spread.predict_proba(X)

In [None]:
#get rid of spaces in column headers
df.columns = df.columns.str.replace(" ", "")
#create a dictionary 
d = {'Project_code': df['BridgeOpportunity:ProjectCode'], 'Unsuitable': double_array[:,0], 'Suitable': double_array[:,1], 
     'Bridge_classification': df['Bridgeclassification'], 'Crossing': df['crossing'], 'Days_per_year_river_is_flooded': df['Daysperyearriverisflooded'],
     'River_crossing_injuries_in_last_3_years': df['Rivercrossingdeathsinlast3years'], 'Cell_service_quality': df['Cellservicequality'],
     '4WD_Accessibility': df['4WDAccessibility'],'Bridge_Type': df['BridgeType'],'Proposed_Bridge_Location(GPS)(Latitude)': df['ProposedBridgeLocation(GPS)(Latitude)'],
     'Proposed_Bridge_Location(GPS)(Longitude)': df['ProposedBridgeLocation(GPS)(Longitude)'],'Height_differential_between_banks': df['Heightdifferentialbetweenbanks'],
     'Estimated_span(m)': df['Estimatedspan(m)']}
#convert to dataframe 
viz_df = pd.DataFrame(data=d)
viz_df

###This function was created in order to take a project code and an empty dictionary and populate it with it nearest neighbors according to the project code 

In [None]:
def nearest_neighborz(project_code, common):

  row = viz_df[viz_df['Project_code'] ==project_code]
  prob_0 = row['Unsuitable'] 
  prob_1 = row['Suitable']
               
  for index,row in viz_df.iterrows():
    if (math.isclose(row['Unsuitable'], prob_0,rel_tol=.02) == True) & (math.isclose(row['Suitable'], prob_1,rel_tol=.02) == True):
      common[project_code].add(row["Project_code"])
  return common

In [None]:
#this love appends all the project codes to an empty dict as the key and makes the value a set()
neig = {}
for i in viz_df['Project_code']: 
  neig[i] = set()
#now we use another for loop that utilizes our nearest neighborz
# function and populates the empty dict neig
for i in viz_df['Project_code']: 
  nearest_neighborz(i, neig)
#delete all values that don't have neighbors 
for k,v in list(neig.items()):
    if len(v) == 0:
       del neig[k]
neig

In [55]:
#convert dict to dataframe in order to export it for fastapi 
neighbors_df = pd.DataFrame(list(neig.values()), index=neig.keys())
neighbors_df[0]

1009317    1014035
1007325    1013418
1007466    1007466
1007495    1007495
1007507    1013286
            ...   
1007481    1007503
1014031    1007422
1014056    1013551
1014069    1013499
1015561    1014327
Name: 0, Length: 1418, dtype: object

In [56]:
neighbors_data  = neighbors_df.reset_index() #reset index 

In [None]:
neighbors_data.columns = ['main', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35',
       '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47',
       '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59',
       '60', '61']
neighbors_data

In [None]:
def knn_viz(project_code):

  project_code_row = neighbors_data.loc[neighbors_data['main'] == f"{project_code}"]
  transformed_project_code = project_code_row.T.dropna()
  transformed_project_code.columns = ["PG"]
  transformed_project_code["PG"] = transformed_project_code["PG"].astype(int)
  transformed_project_code["PG"] = transformed_project_code["PG"].astype(str)
  lst =list(transformed_project_code["PG"])
  knn_df = viz_df.loc[viz_df['Project_code'].isin(lst)]
  knn_df["main"] = np.where(knn_df['Project_code']==f'{project_code}', 'selected', 'neighbors')
 
  fig = px.scatter(knn_df, x='Unsuitable', y='Suitable',
                    hover_data=['Project_code','Suitable','Unsuitable', 'Bridge_classification',
                              'Crossing', 'Days_per_year_river_is_flooded',
                                'River_crossing_injuries_in_last_3_years', 'Cell_service_quality',
                                '4WD_Accessibility', 'Bridge_Type',
                                'Proposed_Bridge_Location(GPS)(Latitude)',
                                'Proposed_Bridge_Location(GPS)(Longitude)',
                                'Height_differential_between_banks', 
                                'Estimated_span(m)'], color = knn_df["main"]
                              , color_discrete_map={"selected":"#F6D143",
                                                    "neighbors": "#79EC7D"}
                  )
  return fig.show()

In [60]:
print(knn_viz('1013551'))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



None


###Take dataframe with prediction labels and convert it to csv in order to add to the postgres database

In [None]:
from google.colab import drive
drive.mount('drive')


Mounted at drive


In [None]:
df.shape

(1472, 48)

In [None]:
viz_df.to_csv('viz_data.csv')
!cp data.csv "drive/My Drive/"

cp: cannot stat 'data.csv': No such file or directory
