In [60]:
import pandas as pd 
import numpy as np 

edu = pd.read_csv('../dataset/education.csv')
lang = pd.read_csv('../dataset/languages.csv')
skills = pd.read_csv('../dataset/skills.csv')
exp = pd.read_csv('../dataset/work_experiences.csv')
train = pd.read_csv('../dataset/train_users.csv')
test = pd.read_csv('../dataset/test_users.csv')

### Exploring Education

In [3]:
edu.sort_values(by='user_id').head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,Yüksek Lisans,Yazılım Mühendisliği,,
4,1,Fırat Üniversitesi,Lisans,Yazılım Mühendisliği,,


In [4]:
edu.isnull().sum() / len(edu) * 100

user_id              0.000000
school_name          0.000000
degree              25.139050
fields_of_study     13.914080
start_year_month    94.247238
end_year_month      94.676486
dtype: float64

In [5]:
edu = edu[['user_id', 'school_name', 'fields_of_study','degree']]

In [6]:
tot = 0
for i in edu.degree.value_counts().tolist():
    tot += i
(edu.degree.str.lower().value_counts() / tot *100)[:10]

lisans derecesi             17.939157
bachelor's degree           14.398546
master's degree              5.034994
yüksek lisans (master)       4.915068
lisans                       2.992514
master of science - ms       2.198008
bachelor of science - bs     1.921617
lise                         1.623678
high school diploma          1.618056
önlisans                     1.517806
Name: degree, dtype: float64

In [7]:
mapping_dict = {
    "lise":["lise", "high school"],
    "Ön Lisans":["ön", "associate","yüksekokul"],
    "Lisans":["lisans", "bachelor","üniversite","bsc", 
              "bs","licentiate","mühendislik","engineer's",
              "undergraduate","i̇şletme fakültesi","fen",
              "b.sc.","fakülte","b.s"],
    "Master":["yüksek", "master","msc","ms","mba","m.sc."],
    "Doktora":["phd", "doctor","doktora"]
}

In [8]:
import re
import pandas as pd

def map_degree(degree):
    if isinstance(degree, float) and pd.isna(degree):
        return degree
    degree = re.sub(r'[\.,;]', '', degree).lower()
    for mapped_degree, keywords in mapping_dict.items():
        for keyword in keywords:
            keyword = keyword.lower()
            if re.search(r"\b" + keyword + r"\b", degree):
                return mapped_degree
    return "diğer"

In [9]:
edu['degree'] = edu['degree'].apply(map_degree)
edu['degree'].value_counts()

Lisans       61984
diğer        17973
Master       15047
lise          5413
Ön Lisans     3830
Doktora       2486
Name: degree, dtype: int64

In [10]:
tot = 0
for i in edu.fields_of_study.value_counts().tolist():
    tot += i
(edu.fields_of_study.str.lower().value_counts() / tot *100)[:50]

bilgisayar mühendisliği                                   11.994753
computer engineering                                      10.383992
elektrik ve elektronik mühendisliği                        2.902140
electrical and electronics engineering                     2.626755
computer science                                           2.557501
bilgisayar programlama                                     1.503214
computer programming                                       1.266937
yönetim bilişim sistemleri                                 1.114578
mechanical engineering                                     1.055916
industrial engineering                                     1.005402
i̇şletme                                                   0.979330
i̇şletme ve yönetim, genel                                 0.956517
business administration and management, general            0.824527
elektronik ve haberleşme mühendisliği                      0.725128
software engineering                            

In [11]:
mapping_study = {
    "computer engineering": [
        "bilgisayar mühendisliği",
        "computer engineering",
        "bilgisayar yazılımı mühendisliği",
        "computer software engineering",
        "computer science and engineering",
        "computer science",
        "computer engineer"
    ],
    "electrical and electronics engineering": [
        "elektrik ve elektronik mühendisliği",
        "electrical and electronics engineering",
        "elektronik ve haberleşme mühendisliği",
        "electronics engineering",
        "electronics and communication engineering",
        "electronics and communications engineering",
        "electrical, electronics and communications engineering",
        "electrical engineering"
    ],
    "business administration and management": [
        "yönetim bilişim sistemleri",
        "i̇şletme",
        "i̇şletme ve yönetim, genel",
        "business administration and management, general",
        "business administration",
        "mba",
        "yönetim bilgi sistemleri, genel",
        "management information systems",
        "management information systems, general"
    ],
    "mechanical engineering": [
        "makine mühendisliği",
        "mechanical engineering"
    ],
    "industrial engineering": [
        "endüstri mühendisliği",
        "industrial engineering"
    ],
    "mathematics": [
        "bilgisayar programlama",
        "mathematics",
        "matematik",
        "science",
        "fen bilimleri",
        "matematik mühendisliği"
    ],
    "computer programming": [
        "computer programming",
        "bilgisayar programlama/programcı, genel",
        "bilgisayar programcılığı"
    ],
    "software engineering": [
        "yazılım mühendisliği",
        "software engineering"
    ],
    "information technology": [
        "information technology"
    ],
    "economics": [
        "economics"
    ],
    "physics": [
        "physics",
        "fizik"
    ],
    "chemical engineering": [
        "kimya mühendisliği",
        "chemical engineering"
    ],
    "engineering management": [
        "engineering management"
    ],
    "civil engineering": [
        "civil engineering"
    ],
    "mechatronics, robotics and automation engineering": [
        "mekatronik, robotik ve otomasyon mühendisliği"
    ]
}

In [12]:
import re
import pandas as pd

def map_study(study):
    if isinstance(study, float) and pd.isna(study):
        return study
    study = re.sub(r'[\.,;]', '', study).lower()
    for mapped, keywords in mapping_study.items():
        for keyword in keywords:
            keyword = keyword.lower()
            if re.search(r"\b" + keyword + r"\b", study):
                return mapped
    return "diğer"

In [13]:
edu['fields_of_study'] = edu['fields_of_study'].apply(map_study)
edu['fields_of_study'].value_counts()

diğer                                     45288
computer engineering                      34693
electrical and electronics engineering    10676
mathematics                                9437
business administration and management     9247
computer programming                       2694
mechanical engineering                     2340
industrial engineering                     2265
software engineering                       1643
physics                                    1295
chemical engineering                        881
economics                                   833
information technology                      721
engineering management                      383
civil engineering                           341
Name: fields_of_study, dtype: int64

In [14]:
edu.head()

Unnamed: 0,user_id,school_name,fields_of_study,degree
0,0,Anadolu Üniversitesi,,
1,0,Adıyaman Üniversitesi,,
2,0,Fırat Üniversitesi,,
3,1,Fırat Üniversitesi,software engineering,Lisans
4,1,Fırat Üniversitesi,software engineering,Lisans


In [15]:
edu.isnull().sum() / len(edu) * 100

user_id             0.00000
school_name         0.00000
fields_of_study    13.91408
degree             25.13905
dtype: float64

In [16]:
edu['fields_of_study'].fillna(value=edu['fields_of_study'].mode()[0], inplace=True)
edu['degree'].fillna(value=edu['degree'].mode()[0], inplace=True)
edu.isnull().sum()

user_id            0
school_name        0
fields_of_study    0
degree             0
dtype: int64

In [17]:
edu = edu.drop_duplicates()
edu['school_name'] = edu['school_name'].str.replace(' ','_')
edu.head()

Unnamed: 0,user_id,school_name,fields_of_study,degree
0,0,Anadolu_Üniversitesi,diğer,Lisans
1,0,Adıyaman_Üniversitesi,diğer,Lisans
2,0,Fırat_Üniversitesi,diğer,Lisans
3,1,Fırat_Üniversitesi,software engineering,Lisans
5,2,Fırat_Üniversitesi,computer engineering,diğer


In [18]:
edu['school_name'].value_counts()[:10]

Anadolu_Üniversitesi             6380
İstanbul_Üniversitesi            4772
Sakarya_Üniversitesi             4254
Kocaeli_Üniversitesi             3741
Istanbul_Technical_University    3726
İstanbul_Teknik_Üniversitesi     3721
Yıldız_Teknik_Üniversitesi       3698
Marmara_Üniversitesi             3515
Hacettepe_Üniversitesi           2936
Ege_Üniversitesi                 2781
Name: school_name, dtype: int64

In [19]:
school = edu.groupby('user_id')["school_name"].apply(" ".join).reset_index()
fields = edu.groupby('user_id')["fields_of_study"].apply(" ".join).reset_index()
degree = edu.groupby('user_id')["degree"].apply(" ".join).reset_index()
edu = pd.merge(pd.merge(school, fields, on='user_id', how='left'), degree, on='user_id', how='left')
edu.reset_index(drop=True)
edu

Unnamed: 0,user_id,school_name,fields_of_study,degree
0,0,Anadolu_Üniversitesi Adıyaman_Üniversitesi Fır...,diğer diğer diğer,Lisans Lisans Lisans
1,1,Fırat_Üniversitesi,software engineering,Lisans
2,2,Fırat_Üniversitesi Hafsa_Sultan_Mesleki_ve_Tek...,computer engineering diğer,diğer lise
3,3,Fırat_Üniversitesi,computer engineering,Lisans
4,4,Fırat_Üniversitesi,software engineering,Lisans
...,...,...,...,...
66266,66269,Boğaziçi_Üniversitesi Boğaziçi_Üniversitesi OML,business administration and management compute...,Master Lisans Lisans
66267,66270,Boğaziçi_Üniversitesi Massachusetts_Institute_...,physics economics mathematics physics,Master diğer Lisans Lisans
66268,66271,Boğaziçi_Üniversitesi_/_Bogazici_University Bo...,mathematics diğer diğer,Master Lisans Doktora
66269,66272,Yildiz_Technical_University Halmstad_Universit...,computer engineering computer engineering diğer,Lisans Lisans Lisans


In [20]:
edu.to_csv('../exported_datasets/edu.csv', index=False)

<hr>

#### Language Exploring

In [21]:
lang.sort_values(by='user_id').head()

Unnamed: 0,user_id,language,proficiency
0,8,İngilizce,full_professional
1,8,Türkçe,native_or_bilingual
2,8,Fransızca,elementary
3,10,ingilizce,
4,11,Turkish,native_or_bilingual


In [22]:
lang.isnull().sum() / len(lang) *100

user_id         0.000000
language        0.000000
proficiency    14.016197
dtype: float64

In [23]:
lang.proficiency.value_counts()

native_or_bilingual     22026
professional_working    18306
elementary               8971
full_professional        8389
limited_working          7709
Name: proficiency, dtype: int64

In [24]:
lang.language.str.lower().value_counts()[:25]

i̇ngilizce                              20702
english                                 14906
türkçe                                  14010
turkish                                  8264
almanca                                  3755
german                                   3108
french                                    934
spanish                                   842
i̇spanyolca                               837
fransızca                                 757
ingilizce                                 661
rusça                                     538
russian                                   521
arabic                                    520
arapça                                    387
italian                                   365
i̇talyanca                                333
i̇ngilizce, orta (1100-1500)              330
japanese                                  276
deutsch                                   203
japonca                                   162
i̇ngilizce                        

In [25]:
mapping_language = {
    "ingilizce": ["ingilzice","english","i̇ngilizce, eski (yaklaşık 450-1100)","i̇ngilizce, orta (1100-1500)"], 
    "türkçe":['türkçe','turkish','türkçe, osmanlıca (1500-1928)'],
    "almanca":['germany','german','almanca','deutch','deutsch'],
    "fransızca":['french','fransızca'],
    "ispanyolca":['spanish','ispanyolca'],
    "rusça":['russian','rusça'],
    "arapça":['arapça','arabian','arabic'],
    "italyanca":['italian','italyanca'],
    "japonca":['japanese','japonca'],
    "azerice":['azerbaijani']
}

In [26]:
def map_lang(lang):
    if isinstance(lang, float) and pd.isna(lang):
        return lang
    lang = re.sub(r'[\.,;]', '', lang).lower()
    for mapped, keywords in mapping_language.items():
        for keyword in keywords:
            keyword = keyword.lower()
            if re.search(r"\b" + keyword + r"\b", lang):
                return mapped
    return "diğer"

In [27]:
lang['language'] = lang['language'].apply(map_lang)
lang['language'].value_counts()

diğer         25809
türkçe        22512
ingilizce     15008
almanca        7184
fransızca      1709
rusça          1066
arapça          935
ispanyolca      865
japonca         442
italyanca       380
azerice         152
Name: language, dtype: int64

In [28]:
lang.head()

Unnamed: 0,user_id,language,proficiency
0,8,diğer,full_professional
1,8,türkçe,native_or_bilingual
2,8,fransızca,elementary
3,10,diğer,
4,11,türkçe,native_or_bilingual


In [29]:
lang.isnull().sum() / len(lang) * 100

user_id         0.000000
language        0.000000
proficiency    14.016197
dtype: float64

In [30]:
most_frequent = lang['proficiency'].mode()[0]
lang['proficiency'].fillna(value=most_frequent, inplace=True)
lang.isnull().sum()

user_id        0
language       0
proficiency    0
dtype: int64

In [31]:
lang.head()

Unnamed: 0,user_id,language,proficiency
0,8,diğer,full_professional
1,8,türkçe,native_or_bilingual
2,8,fransızca,elementary
3,10,diğer,native_or_bilingual
4,11,türkçe,native_or_bilingual


In [32]:
languages = lang.groupby('user_id')["language"].apply(" ".join).reset_index()
proficiencies = lang.groupby('user_id')["proficiency"].apply(" ".join).reset_index()
lang1 = pd.merge(languages, proficiencies, on='user_id', how='left')
lang1.reset_index(drop=True)
lang1

Unnamed: 0,user_id,language,proficiency
0,8,diğer türkçe fransızca,full_professional native_or_bilingual elementary
1,10,diğer,native_or_bilingual
2,11,türkçe ingilizce diğer,native_or_bilingual professional_working nativ...
3,12,türkçe ingilizce,native_or_bilingual professional_working
4,13,diğer almanca,full_professional limited_working
...,...,...,...
37285,66265,ingilizce,native_or_bilingual
37286,66269,ingilizce,native_or_bilingual
37287,66271,ingilizce,native_or_bilingual
37288,66272,ingilizce,professional_working


In [33]:
lang1.to_csv("../exported_datasets/lang.csv", index=False)

<hr>

#### Skills Exploring

In [34]:
skills.sort_values(by='user_id').head()

Unnamed: 0,user_id,skill
0,1,Mühendislik
1,1,Eğitim
2,2,Android
3,2,Java
4,2,3D Studio Max


In [35]:
skills.isnull().sum()

user_id    0
skill      0
dtype: int64

In [36]:
skills['skill'] = skills['skill'].str.replace(' ','_')
skills = skills.groupby('user_id')["skill"].apply(" ".join).reset_index()
skills

Unnamed: 0,user_id,skill
0,1,Mühendislik Eğitim
1,2,Android Java 3D_Studio_Max Müşteri_Hizmetleri ...
2,3,Mühendislik Programlama Algoritma_Tasarımı
3,5,Oyun_Tasarımı Veritabanı_Yönetimi Veri_Bilimi ...
4,6,Liderlik Ekip_Liderliği Mühendislik Eğitim İle...
...,...,...
62397,66269,Cloud_Computing Strategic_Partnerships Enterpr...
62398,66270,PMP Stratejik_Yönetim Stratejik_Bilişim_Yöneti...
62399,66271,Software_Development ASP.NET C# Programming E-...
62400,66272,Java Tomcat MySQL Software_Development Softwar...


In [37]:
skills.to_csv('../exported_datasets/skills.csv', index=False)

<hr>

#### Experiment Exploring

In [38]:
exp.sort_values(by='user_id').head()

Unnamed: 0,user_id,company_id,location,start_year_month
147720,0,0,Serbest Çalışmalar,200509
174454,0,0,Visual Studio Asp.Net Developer,200509
180157,2,10,"Mersin, Turkey",201806
19762,2,7,"Elazig, Turkey",201706
760,2,9,"Elazig, Turkey",201612


In [39]:
exp['location'] = exp['location'].str.lower()
exp['location'].value_counts()[:50]

istanbul, turkey                   44787
ankara, turkey                     14216
i̇stanbul, türkiye                 13176
ankara, türkiye                     4556
izmir, turkey                       4286
kocaeli, turkey                     2420
i̇stanbul                           1917
i̇zmir, türkiye                     1640
turkey                              1570
ankara                              1300
kocaeli, türkiye                    1124
bursa, turkey                       1013
eskisehir, turkey                    915
türkiye                              868
sakarya, turkey                      805
istanbul                             768
manisa, turkey                       678
antalya, turkey                      663
i̇zmir                               632
kayseri, turkey                      479
adana, turkey                        469
eskişehir, türkiye                   388
konya, turkey                        374
gebze                                356
bursa, türkiye  

In [40]:
mapping_loc={
    "istanbul":["istanbul", "istanbul, turkey", "ümraniye",
                "maslak","ümraniye, istanbul, türkiye", "beşiktaş",
                "ataşehir, istanbul, türkiye"],
    "ankara":["ankara", "çankaya, ankara, türkiye"],
    "izmir":["izmir", "izmir, türkiye", "izmir, turkey"],
    "kocaeli":["kocaeli", "gebze, kocaeli"],
    "bursa":["bursa"],
    "eskisehir":["eskisehir", "eskişehir"],
    "sakarya":["sakarya"],
    "antalya":["antalya"],
    "manisa":["manisa"],
    "kayseri":["kayseri"],
    "adana":["adana"],
    "konya":["konya"],
    "trabzon":["trabzon"],
    "mersin":["mersin"],
    "tekirdağ":["tekirdağ", "tekirdağ, türkiye"],
    "mugla":["mugla"],
    "denizli":["denizli"],
    "samsun":["samsun"],
    "elazig":["elazig", "elazığ, türkiye"],
    "US":["us"],
    "isparta":["isparta, turkey"],
    "gaziantep":["gaziantep, turkey"],
    "edirne":["edirne, turkey"],
    "sivas":["sivas, turkey"],
    "kahramanmaras":["kahramanmaras, turkey"]
}

In [41]:
def map_city(city):
    if isinstance(city,float) and pd.isna(city):
        return city
    city = city.lower()
    for mapped, keywords in mapping_loc.items():
        for keyword in keywords:
            keyword = keyword.lower()
            if re.search(r'\b' + keyword + r'\b', city):
                return mapped
    return 'Diğer'

In [42]:
exp['location'] = exp['location'].apply(map_city)
exp['location'].value_counts()

istanbul         47814
Diğer            37339
ankara           20869
izmir             4456
kocaeli           4456
bursa             1529
eskisehir         1505
sakarya           1195
antalya           1039
manisa            1032
kayseri            744
adana              732
konya              687
trabzon            368
mersin             311
elazig             223
tekirdağ           198
denizli            187
mugla              183
samsun             178
isparta            119
gaziantep          112
edirne             111
sivas              109
kahramanmaras       92
US                  17
Name: location, dtype: int64

In [43]:
exp.isnull().sum() / len(exp) *100

user_id              0.000000
company_id           0.000000
location            32.838375
start_year_month     0.000000
dtype: float64

In [44]:
exp['location'].fillna("diğer", inplace=True)
exp.isnull().sum()

user_id             0
company_id          0
location            0
start_year_month    0
dtype: int64

In [45]:
exp.head()

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,2651,istanbul,201509
1,34558,815,istanbul,201210
2,63761,26354,diğer,200010
3,10738,89,diğer,201610
4,8711,3113,istanbul,201801


In [46]:
exp['start_year_month'] = pd.to_datetime(exp['start_year_month'], format="%Y%m")
exp.head()

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,2651,istanbul,2015-09-01
1,34558,815,istanbul,2012-10-01
2,63761,26354,diğer,2000-10-01
3,10738,89,diğer,2016-10-01
4,8711,3113,istanbul,2018-01-01


In [47]:
exp.to_csv("../exported_datasets/exp.csv", index=False)

<hr>

In [48]:
train.head()

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1
1,6950,Internet,"Istanbul, Istanbul, Turkey",0
2,4880,Online Media,Turkey,0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0
4,11005,Banking,"Istanbul, Turkey",0


In [50]:
train['location'] = train['location'].str.lower()
train['location'].value_counts()[:50]

istanbul, istanbul, turkey        14835
turkey                            10922
istanbul, turkey                   9615
ankara, turkey                     3736
ankara, ankara, turkey             3418
i̇zmir, turkey                     1526
kadikoy, istanbul, turkey           486
pendik, istanbul, turkey            319
antalya, turkey                     295
maltepe, istanbul, turkey           287
üsküdar, istanbul, turkey           284
kocaeli, turkey                     267
cankaya, ankara, turkey             246
adana, turkey                       243
besiktas, istanbul, turkey          230
kayseri, turkey                     228
sariyer, istanbul, turkey           224
eskişehir, turkey                   223
bursa, turkey                       219
atasehir, istanbul, turkey          219
kartal, istanbul, turkey            218
sisli, istanbul, turkey             214
umraniye, istanbul, turkey          200
konya, turkey                       188
sakarya, turkey                     144


In [54]:
mapping_loc2={
    "istanbul":["istanbul", "istanbul, turkey", "ümraniye",
                "maslak","ümraniye, istanbul, türkiye", "beşiktaş",
                "ataşehir, istanbul, türkiye","istanbul, istanbul, turkey",
                "kadikoy, istanbul, turkey", "pendik, istanbul, turkey",
                "maltepe, istanbul, turkey", "üsküdar, istanbul, turkey",
                "beşiktaş, istanbul, turkey", "besiktas, istanbul, turkey",
                "sariyer, istanbul, turkey", "atasehir, istanbul, turkey",
                "kartal, istanbul, turkey", "sisli, istanbul, turkey",
                "umraniye, istanbul, turkey", "kagithane, istanbul, turkey",
                "bakirkoy, istanbul, turkey", "tuzla, istanbul, turkey",
                "bahcelievler, istanbul, turkey", "beylikduzu, istanbul, turkey",
                "avcilar, istanbul, turkey", "kucukcekmece, istanbul, turkey",
                "greater istanbul", "sancaktepe, istanbul, turkey",
                "beykoz, istanbul, turkey", "fatih, istanbul, turkey",
                "zeytinburnu, istanbul, turkey"],
    "ankara":["ankara", "çankaya, ankara, türkiye",
              "ankara, ankara, turkey", "ankara, turkey",
              "cankaya, ankara, turkey", "yenimahalle, ankara, turkey",
              "altindag, ankara, turkey"],
    "izmir":["izmir", "izmir, türkiye", "izmir, turkey",
             "bornova, izmir, turkey", "izmir, izmir, turkey"],
    "kocaeli":["kocaeli", "gebze, kocaeli", "kocaeli, turkey",
               "izmit, kocaeli, turkey", "gebze, kocaeli, turkey"],
    "bursa":["bursa", "bursa, turkey"],
    "eskisehir":["eskisehir", "eskişehir", "eskisehir, turkey"],
    "sakarya":["sakarya", "sakarya, turkey", "serdivan, sakarya, turkey"],
    "antalya":["antalya", "antalya, turkey"],
    "manisa":["manisa", "manisa, turkey"],
    "kayseri":["kayseri", "kayseri, turkey"],
    "adana":["adana", "adana, turkey"],
    "konya":["konya", "konya, turkey"],
    "trabzon":["trabzon", "trabzon, turkey"],
    "mersin":["mersin", "mersin, turkey", "mercin, içel, turkey"],
    "tekirdağ":["tekirdağ", "tekirdağ, türkiye"],
    "mugla":["mugla", "mugla, turkey", "muğla, turkey"],
    "denizli":["denizli", "denizli, turkey"],
    "samsun":["samsun", "samsun, turkey"],
    "elazig":["elazig", "elazığ, türkiye", "elazig, turkey"],
    "US":["us"],
    "isparta":["isparta, turkey"],
    "gaziantep":["gaziantep, turkey"],
    "edirne":["edirne, turkey"],
    "sivas":["sivas, turkey"],
    "kahramanmaras":["kahramanmaras, turkey"]
}

In [56]:
def map_city2(city):
    if isinstance(city,float) and pd.isna(city):
        return city
    city = city.lower()
    for mapped, keywords in mapping_loc2.items():
        for keyword in keywords:
            keyword = keyword.lower()
            if re.search(r'\b' + keyword + r'\b', city):
                return mapped
    return 'Diğer'

In [58]:
train['location'] = train['location'].str.lower()
train['location'] = train['location'].apply(map_city2)
train['location'].value_counts()

istanbul     28391
Diğer        13521
ankara        7708
kocaeli        557
antalya        387
bursa          292
eskisehir      291
adana          288
sakarya        286
kayseri        269
konya          250
manisa         170
mugla          146
samsun          76
tekirdağ        62
denizli         62
isparta         48
edirne          44
mersin          42
gaziantep       41
trabzon         35
sivas           34
izmir           15
elazig           3
Name: location, dtype: int64

In [61]:
train.to_csv('../exported_datasets/based_train.csv', index=False)

In [62]:
test.head()

Unnamed: 0,user_id,industry,location
0,17449,Research,Turkey
1,33967,Computer Software,"Istanbul, Istanbul, Turkey"
2,2110,Automotive,Turkey
3,55082,Internet,Turkey
4,37165,Electrical/Electronic Manufacturing,Turkey


In [63]:
test['location'] = test['location'].str.lower()
test['location'] = test['location'].apply(map_city2)
test['location'].value_counts()

istanbul     7081
Diğer        3371
ankara       1948
kocaeli       134
antalya       106
kayseri        89
bursa          82
adana          68
sakarya        65
konya          55
eskisehir      49
manisa         44
mugla          42
samsun         23
tekirdağ       18
denizli        15
mersin         12
gaziantep      12
trabzon        11
sivas          11
isparta        10
edirne          7
izmir           2
Name: location, dtype: int64

In [64]:
test.to_csv('../exported_datasets/test.csv', index=False)