In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
###### helper functions. Use them when needed #######
def get_title_from_index(index):
	return df[df.index == index]["jop_tital"].values[0]

def get_index_from_title(title):
	return df[df.jop_tital == title].index

In [3]:
##Step 1: Read CSV File
df = pd.read_csv("Wuzzuf WebScraper Dataset.csv")
df.head()

Unnamed: 0,company_name,company_location,date,dateNum,jop_tital,jop_type,jop_level,min_exp,max_exp,related_jop_titel,strongKeyWord,specificWorde,description
0,Globaction,"6th of October, Giza, Egypt",4 days ago,4,Data Scientist Engineer,Full Time,Entry Level,1,1,ITSoftware Development Engineering - TelecomTe...,Data Science data,1+ Yrs of Exp Data Science data analyst analyst,Data Science 5S Lean Manufacturing Methodology...
1,aliaict,"Nasr City, Cairo, Egypt",5 days ago,5,Senior Data Analyst,Full Time,Experienced,2,4,ITSoftware Development AnalystResearch Enginee...,Data Data Science Science,2 - 4 Yrs of Exp Data Analysis Analysis Data S...,Data Analysis Data Science Information Managem...
2,Carina Wear,"New Cairo, Cairo, Egypt",6 hours ago,6,Data Analyst,Full Time,Entry Level,1,3,Human Resources AnalystResearch,Data Science Data,1 - 3 Yrs of Exp Data Analysis Analysis Comput...,Data Analysis Microsoft Power BI business Anal...
3,Othaim,"Katameya, Cairo, Egypt",3 days ago,3,Data Analyst,Full Time,Experienced,3,15,ITSoftware Development AnalystResearch,Science Data,3 - 15 Yrs of Exp Computer Science Computer Da...,Computer Science Information Technology (IT) D...
4,Partner & More,"6th of October, Giza, Egypt",10 days ago,10,Data Analyst - 1-3 years exp.,Full Time,Entry Level,1,3,ITSoftware Development AnalystResearch Enginee...,Data Data Science,1 - 3 Yrs of Exp Data Analysis Analysis Data C...,Data Analysis Analysis Data Computer Science I...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 13 columns):
company_name         75 non-null object
company_location     75 non-null object
date                 75 non-null object
dateNum              75 non-null int64
jop_tital            75 non-null object
jop_type             75 non-null object
jop_level            75 non-null object
min_exp              75 non-null int64
max_exp              75 non-null int64
related_jop_titel    75 non-null object
strongKeyWord        64 non-null object
specificWorde        75 non-null object
description          75 non-null object
dtypes: int64(3), object(10)
memory usage: 7.7+ KB


In [5]:
df.shape

(75, 13)

In [6]:
df.isna().sum()

company_name          0
company_location      0
date                  0
dateNum               0
jop_tital             0
jop_type              0
jop_level             0
min_exp               0
max_exp               0
related_jop_titel     0
strongKeyWord        11
specificWorde         0
description           0
dtype: int64

In [8]:
df.fillna('',inplace=True)

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
##Step 2: Select Features
features = ['jop_type','jop_level','related_jop_titel','strongKeyWord','specificWorde','description']

In [16]:
##Step 3: Create a column in DF which combines all selected features
for feature in features:
	df[feature] = df[feature].fillna('')

def combine_features(row):
	try:
		return row['jop_type'] +" "+row['jop_level']+" "+row["related_jop_titel"]+" "+row["strongKeyWord"]+" "+row["specificWorde"]+" "+row["description"]
	except:
		print ("Error:", row)


In [17]:
df["combined_features"] = df.apply(combine_features,axis=1)

df["combined_features"].head()

0    Full Time Entry Level ITSoftware Development E...
1    Full Time Experienced ITSoftware Development A...
2    Full Time Entry Level Human Resources AnalystR...
3    Full Time Experienced ITSoftware Development A...
4    Full Time Entry Level ITSoftware Development A...
Name: combined_features, dtype: object

In [18]:
##Step 4: Create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])

In [19]:
##Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)

In [20]:
cosine_sim

array([[1.        , 0.67891242, 0.5629605 , ..., 0.28200477, 0.31627658,
        0.34314859],
       [0.67891242, 1.        , 0.74655171, ..., 0.59313988, 0.59834934,
        0.56966184],
       [0.5629605 , 0.74655171, 1.        , ..., 0.36887783, 0.34564003,
        0.33750618],
       ...,
       [0.28200477, 0.59313988, 0.36887783, ..., 1.        , 0.75987664,
        0.80944875],
       [0.31627658, 0.59834934, 0.34564003, ..., 0.75987664, 1.        ,
        0.73830454],
       [0.34314859, 0.56966184, 0.33750618, ..., 0.80944875, 0.73830454,
        1.        ]])

In [37]:
## Step 6: Get index of this movie from its title
movie_user_likes = "Data Analyst"

movie_index = get_index_from_title(movie_user_likes)[0]

similar_movies =  list(enumerate(cosine_sim[movie_index]))

In [38]:
similar_movies

[(0, 0.5629604954667469),
 (1, 0.7465517058725967),
 (2, 1.0000000000000004),
 (3, 0.7348469228349536),
 (4, 0.8140915784106942),
 (5, 0.4617538752396487),
 (6, 0.5685352436149612),
 (7, 0.7533990064322368),
 (8, 0.47765850230505835),
 (9, 0.6489329914661761),
 (10, 0.7515416254704821),
 (11, 0.49330001764383435),
 (12, 0.28090032386679487),
 (13, 0.7804368548602442),
 (14, 0.541918583574065),
 (15, 0.3849001794597505),
 (16, 0.7017932675554326),
 (17, 0.7076303701373626),
 (18, 0.3161680045562236),
 (19, 0.37569568111143015),
 (20, 0.5760322529511887),
 (21, 0.5368773709008973),
 (22, 0.4781130276875432),
 (23, 0.4931533549328053),
 (24, 0.36493249912882486),
 (25, 0.33685503477268464),
 (26, 0.338401020306769),
 (27, 0.3638438983717913),
 (28, 0.3415004757769866),
 (29, 0.3543917920327852),
 (30, 0.350329236163592),
 (31, 0.26419797463373906),
 (32, 0.3638438983717913),
 (33, 0.32631500345752024),
 (34, 0.39749814964666297),
 (35, 0.41079191812887467),
 (36, 0.3638438983717913),
 (37

In [39]:
## Step 7: Get a list of similar movies in descending order of similarity score
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)

In [40]:
sorted_similar_movies

[(2, 1.0000000000000004),
 (4, 0.8140915784106942),
 (13, 0.7804368548602442),
 (7, 0.7533990064322368),
 (10, 0.7515416254704821),
 (1, 0.7465517058725967),
 (3, 0.7348469228349536),
 (17, 0.7076303701373626),
 (16, 0.7017932675554326),
 (9, 0.6489329914661761),
 (20, 0.5760322529511887),
 (6, 0.5685352436149612),
 (0, 0.5629604954667469),
 (14, 0.541918583574065),
 (21, 0.5368773709008973),
 (37, 0.49892218025118684),
 (11, 0.49330001764383435),
 (23, 0.4931533549328053),
 (22, 0.4781130276875432),
 (8, 0.47765850230505835),
 (5, 0.4617538752396487),
 (50, 0.4392052305789415),
 (35, 0.41079191812887467),
 (34, 0.39749814964666297),
 (62, 0.39464001411506755),
 (67, 0.3930042298310422),
 (54, 0.3912379450278686),
 (61, 0.3852084696008535),
 (15, 0.3849001794597505),
 (40, 0.3772776389454068),
 (19, 0.37569568111143015),
 (43, 0.3711961693228115),
 (72, 0.3688778299954577),
 (38, 0.3688626719822609),
 (24, 0.36493249912882486),
 (27, 0.3638438983717913),
 (32, 0.3638438983717913),
 (36

In [42]:
## Step 8: Print titles of first 50 movies
i=0
for element in sorted_similar_movies:
		print(get_title_from_index(element[0]))
		i=i+1
		if i>10:
			break

Data Analyst
Data Analyst - 1-3 years exp.
Data Analyst (Tableau Developer)
Data Analyst
Data Analyst
Senior Data Analyst
Data Analyst
Sr Business Analyst
Business Analyst
Data Analyst
Senior ETL Developer
