In [1]:
import numpy as np
import pandas as pd

#For text handling and regular expressions
import re
from sklearn.feature_extraction.text import TfidfVectorizer #For converting text to numerical data

#For computing cosine similarity
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the data
df = pd.read_csv("new_special_schools.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Website,Name,Location (P.O. Box),Address,City/Town,Telephone Number 1,Telephone Number 2,Email Address,ABOUT,Product,Latitude,Longitude
0,0,,ACK Ematundu Boys Secondary School for the Deaf,"76, Khwisero 50135","Khwisero ,P.O Box 76, Khwisero 50135",Khwisero,724569230,,,Vocational school for the deaf is a twin insti...,,,
1,1,,Siuna Community Childrens Home,"2985, Kitale 30200","Opposite Kitale AirportP.O. Box 2985, Kitale 3...",Kitale,722296151,722850117,,,,1.019089,35.002305
2,2,,Homa Bay Childrens Home Academy,"77, Homa Bay 40300","Next to SDA ChurchP.O. Box 77, Homa Bay 40300",Homa Bay,5922312,Not found,,,,-0.535043,34.453097
3,3,,St. Poly Children Centre,"9625, Nairobi 00200","P.O Box 9625, Nairobi 00200","Gatwekera Road, Kibera",723642635,733373409,,,,-1.292066,36.821946
4,4,http://www.stpaulchildrenshome.com,St. Pauls Children Home,,"Masaai Road, off Magadi Road,Ongata Rongai, Ne...",Ongata Rongai,722840089,,mmbote2001@yahoo.com,Providing basic needs to the children 2-18; in...,,-1.360855,36.729176


In [3]:
# Fill NaN values with empty strings
df.fillna('',inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,Website,Name,Location (P.O. Box),Address,City/Town,Telephone Number 1,Telephone Number 2,Email Address,ABOUT,Product,Latitude,Longitude
0,0,,ACK Ematundu Boys Secondary School for the Deaf,"76, Khwisero 50135","Khwisero ,P.O Box 76, Khwisero 50135",Khwisero,724569230,,,Vocational school for the deaf is a twin insti...,,,
1,1,,Siuna Community Childrens Home,"2985, Kitale 30200","Opposite Kitale AirportP.O. Box 2985, Kitale 3...",Kitale,722296151,722850117,,,,1.019089,35.002305
2,2,,Homa Bay Childrens Home Academy,"77, Homa Bay 40300","Next to SDA ChurchP.O. Box 77, Homa Bay 40300",Homa Bay,5922312,Not found,,,,-0.535043,34.453097
3,3,,St. Poly Children Centre,"9625, Nairobi 00200","P.O Box 9625, Nairobi 00200","Gatwekera Road, Kibera",723642635,733373409,,,,-1.292066,36.821946
4,4,http://www.stpaulchildrenshome.com,St. Pauls Children Home,,"Masaai Road, off Magadi Road,Ongata Rongai, Ne...",Ongata Rongai,722840089,,mmbote2001@yahoo.com,Providing basic needs to the children 2-18; in...,,-1.360855,36.729176


In [4]:

df.dropna(subset=['Website','Name','Location (P.O. Box)','Address','City/Town','Telephone Number 1','Telephone Number 2', 'Email Address','ABOUT','Product','Latitude','Longitude'], inplace=True,axis=0)

In [5]:
df = df.reset_index(drop=True)

# Clean the text data using regex
df['Website'] = [re.sub(r'[^\w\s]','',t) for t in df['Website']]
df['Name'] = [re.sub('.','',re.sub('','',t)) for t in df['Name']]
df['Location (P.O. Box)'] = [re.sub(',','',re.sub('','',t)) for t in df['Location (P.O. Box)']]
df['Address'] = [re.sub(r'[^\w\s]','',t) for t in df['Address']]
df['City/Town'] = [re.sub(',','',re.sub('','',t)) for t in df['City/Town']]
df['Email Address'] = [re.sub(r'[^\w\s]','',t) for t in df['Email Address']]
df['ABOUT'] = [re.sub(r'[^\w\s]','',t) for t in df['ABOUT']]
df['Product'] = [re.sub(r'[^\w\s]','',t) for t in df['Product']]

In [6]:
# Combine all relevant columns into a single string
df["combined"] = (df['Website']+''+df['Name']+''+
                  df['Location (P.O. Box)']+''+df['Address']+''+
                  df['City/Town']+''+df['Telephone Number 1']+''+
                  df['Telephone Number 2']+''+df['Email Address']+''+
                  df['ABOUT']+''+df['Product']+''+
                  df['Latitude'].astype(str)+''+df['Longitude'].astype(str))

# Drop unnecessary columns
df.drop(['Website','Name','Location (P.O. Box)','Address',
         'City/Town','Telephone Number 1','Telephone Number 2',
         'Email Address','ABOUT','Product','Latitude','Longitude'],
        axis=1, inplace=True)

# Display the head of the modified DataFrame
print(df.head())

   Unnamed: 0                                           combined
0           0  76 Khwisero 50135Khwisero PO Box 76 Khwisero 5...
1           1  2985 Kitale 30200Opposite Kitale AirportPO Box...
2           2  77 Homa Bay  40300Next to SDA ChurchPO Box 77 ...
3           3  9625 Nairobi 00200PO Box 9625 Nairobi 00200Gat...
4           4  httpwwwstpaulchildrenshomecomMasaai Road off M...


In [7]:
# Create a TF-IDF Vectorizer and fit_transform the combined text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'])

In [8]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
# Function to recommend schools
def recommend_school(school_index, num_recommendations=5):
    # Get the pairwise similarity scores of all schools with that school
    sim_scores = list(enumerate(cosine_sim[school_index]))
    # Sort the schools based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the most similar schools
    sim_scores = sim_scores[1:num_recommendations + 1]
    # Get the school indices
    school_indices = [i[0] for i in sim_scores]
    # Return the top recommended schools
    return df.iloc[school_indices]

In [10]:
# Example usage: recommend schools similar to the first school (index 0)
recommended_schools = recommend_school(0)
print(recommended_schools)

    Unnamed: 0                                           combined
33          33  wwwthegranevilleschoolacke0867 Nairobi 00100Ma...
22          22  httpWwwstodafortheviacke26 Maseno 40105Aluor a...
20          20  163 Kapenguria 30600KapenguriaPO Box 163 Kapen...
11          11  84 Vihiga  50310Chekombero Special School for ...
19          19  httpdeafchildrensocietykenyaorg42306 Nairobi 0...
