In [1]:
import pandas as pd
import pprint
# %pprint
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
data = pd.read_csv('data/survey_results_public_training.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,51005,51006,I am a developer by profession,55-64 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Apples,Hobby;Bootstrapping a business;Professional de...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","On the job training;School (i.e., University, ...",...,20.0,20.0,0.0,20.0,0.0,0.0,Appropriate in length,Easy,,9.0
1,23933,23934,I am a developer by profession,35-44 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Apples,Hobby,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;On the job training;Oth...,...,49.0,0.0,0.0,0.0,0.0,0.0,Too long,Difficult,98814.0,9.0
2,59120,59121,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Boots...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Other online resources (e.g., videos, blogs, f...",...,5.0,15.0,15.0,15.0,0.0,5.0,Appropriate in length,Neither easy nor difficult,,3.0
3,1552,1553,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Profe...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Other online resources ...,...,50.0,0.0,0.0,30.0,0.0,0.0,Appropriate in length,Neither easy nor difficult,83777.0,2.0
4,61241,61242,I am a developer by profession,25-34 years old,"Employed, full-time",In-person,Apples,Hobby;Contribute to open-source projects;Boots...,,,...,,,,,,,,,,


In [3]:
list(data.columns)

['Unnamed: 0',
 'ResponseId',
 'MainBranch',
 'Age',
 'Employment',
 'RemoteWork',
 'Check',
 'CodingActivities',
 'EdLevel',
 'LearnCode',
 'LearnCodeOnline',
 'TechDoc',
 'YearsCode',
 'YearsCodePro',
 'DevType',
 'OrgSize',
 'PurchaseInfluence',
 'BuyNewTool',
 'BuildvsBuy',
 'TechEndorse',
 'Country',
 'Currency',
 'CompTotal',
 'LanguageHaveWorkedWith',
 'LanguageWantToWorkWith',
 'LanguageAdmired',
 'DatabaseHaveWorkedWith',
 'DatabaseWantToWorkWith',
 'DatabaseAdmired',
 'PlatformHaveWorkedWith',
 'PlatformWantToWorkWith',
 'PlatformAdmired',
 'WebframeHaveWorkedWith',
 'WebframeWantToWorkWith',
 'WebframeAdmired',
 'EmbeddedHaveWorkedWith',
 'EmbeddedWantToWorkWith',
 'EmbeddedAdmired',
 'MiscTechHaveWorkedWith',
 'MiscTechWantToWorkWith',
 'MiscTechAdmired',
 'ToolsTechHaveWorkedWith',
 'ToolsTechWantToWorkWith',
 'ToolsTechAdmired',
 'NEWCollabToolsHaveWorkedWith',
 'NEWCollabToolsWantToWorkWith',
 'NEWCollabToolsAdmired',
 'OpSysPersonal use',
 'OpSysProfessional use',
 'Off

[
 
 'MainBranch',
 'Age',
 'Employment',
 'EdLevel',
 'LearnCode',
 'TechDoc',
 'YearsCode',
 'YearsCodePro',
 'DevType',

 'LanguageHaveWorkedWith',

 'DatabaseHaveWorkedWith',

 'PlatformHaveWorkedWith',

 'WebframeHaveWorkedWith',

 'EmbeddedHaveWorkedWith',

 'MiscTechHaveWorkedWith',

 'ToolsTechHaveWorkedWith',

 'NEWCollabToolsHaveWorkedWith',

 'OfficeStackAsyncHaveWorkedWith',

 'OfficeStackSyncHaveWorkedWith',

 'AISearchDevHaveWorkedWith',

 'AISelect', << Target Column
 'AISent',
 'AIBen',
 'AIAcc',
 'AIComplex',
 'AIToolCurrently Using',
 'AIToolInterested in Using',
 'AIToolNot interested in Using',
 'AINextMuch more integrated',
 'AINextNo change',
 'AINextMore integrated',
 'AINextLess integrated',
 'AINextMuch less integrated',
 'AIThreat',
 'AIEthics',
 'AIChallenges',
 
 'WorkExp',
 
 
 'ConvertedCompYearly',
]

In [4]:
clean_data = data[[
 'MainBranch',
 'Age',
 'Employment',
 'EdLevel',
 'YearsCode',
 'YearsCodePro',
 'DevType',
 'LanguageHaveWorkedWith',
 'AISearchDevHaveWorkedWith',
 'AISelect']].dropna()

TARGET_COL = 'AISelect'


In [5]:
def str_to_list(string):
    if isinstance(string, str):
        return string.split(';')
    return []

In [6]:
clean_data['LanguageHaveWorkedWith'] = clean_data['LanguageHaveWorkedWith'].apply(str_to_list)

mlb = MultiLabelBinarizer()
unique_langs_encoded = mlb.fit_transform(clean_data['LanguageHaveWorkedWith'])
lang_choice = pd.DataFrame(unique_langs_encoded.astype(bool), columns=mlb.classes_)
langs_encoded = pd.concat([clean_data.drop(columns='LanguageHaveWorkedWith'), lang_choice], axis=1)

list(langs_encoded.columns)

['MainBranch',
 'Age',
 'Employment',
 'EdLevel',
 'YearsCode',
 'YearsCodePro',
 'DevType',
 'AISearchDevHaveWorkedWith',
 'AISelect',
 'Ada',
 'Apex',
 'Assembly',
 'Bash/Shell (all shells)',
 'C',
 'C#',
 'C++',
 'Clojure',
 'Cobol',
 'Crystal',
 'Dart',
 'Delphi',
 'Elixir',
 'Erlang',
 'F#',
 'Fortran',
 'GDScript',
 'Go',
 'Groovy',
 'HTML/CSS',
 'Haskell',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'Lisp',
 'Lua',
 'MATLAB',
 'MicroPython',
 'Nim',
 'OCaml',
 'Objective-C',
 'PHP',
 'Perl',
 'PowerShell',
 'Prolog',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'SQL',
 'Scala',
 'Solidity',
 'Swift',
 'TypeScript',
 'VBA',
 'Visual Basic (.Net)',
 'Zephyr',
 'Zig']

In [7]:
langs_encoded['AISearchDevHaveWorkedWith'].info()

<class 'pandas.core.series.Series'>
Index: 42869 entries, 1 to 30028
Series name: AISearchDevHaveWorkedWith
Non-Null Count  Dtype 
--------------  ----- 
30031 non-null  object
dtypes: object(1)
memory usage: 669.8+ KB


In [8]:
langs_encoded['AISearchDevHaveWorkedWith'] = langs_encoded['AISearchDevHaveWorkedWith'].apply(str_to_list)

unique_aisearch_encoded = mlb.fit_transform(langs_encoded['AISearchDevHaveWorkedWith'])
aisearch_choice = pd.DataFrame(unique_aisearch_encoded.astype(bool), columns=mlb.classes_)
ai_encoded = pd.concat([langs_encoded.drop(columns='AISearchDevHaveWorkedWith'), aisearch_choice], axis=1)

list(ai_encoded.columns)

['MainBranch',
 'Age',
 'Employment',
 'EdLevel',
 'YearsCode',
 'YearsCodePro',
 'DevType',
 'AISelect',
 'Ada',
 'Apex',
 'Assembly',
 'Bash/Shell (all shells)',
 'C',
 'C#',
 'C++',
 'Clojure',
 'Cobol',
 'Crystal',
 'Dart',
 'Delphi',
 'Elixir',
 'Erlang',
 'F#',
 'Fortran',
 'GDScript',
 'Go',
 'Groovy',
 'HTML/CSS',
 'Haskell',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'Lisp',
 'Lua',
 'MATLAB',
 'MicroPython',
 'Nim',
 'OCaml',
 'Objective-C',
 'PHP',
 'Perl',
 'PowerShell',
 'Prolog',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'SQL',
 'Scala',
 'Solidity',
 'Swift',
 'TypeScript',
 'VBA',
 'Visual Basic (.Net)',
 'Zephyr',
 'Zig',
 'Amazon Q',
 'Andi',
 'AskCodi',
 'Bing AI',
 'ChatGPT',
 'Claude',
 'Codeium',
 'Cody',
 'GitHub Copilot',
 'Google Gemini',
 'Lightning AI',
 'Meta AI',
 'Metaphor',
 'Neeva AI',
 'OpenAI Codex',
 'Perplexity AI',
 'Phind',
 'Quora Poe',
 'Replit Ghostwriter',
 'Snyk Code',
 'Tabnine',
 'Visual Studio Intellicode',
 'Whispr AI',
 'WolframAlpha',
 'You

In [9]:
ai_encoded['Employment'] = ai_encoded['Employment'].apply(str_to_list)

unique_employment_encoded = mlb.fit_transform(ai_encoded['Employment'])
employment_choice = pd.DataFrame(unique_employment_encoded.astype(bool), columns=mlb.classes_)
employment_encoded = pd.concat([ai_encoded.drop(columns='Employment'), employment_choice], axis=1)

list(employment_encoded.columns)

['MainBranch',
 'Age',
 'EdLevel',
 'YearsCode',
 'YearsCodePro',
 'DevType',
 'AISelect',
 'Ada',
 'Apex',
 'Assembly',
 'Bash/Shell (all shells)',
 'C',
 'C#',
 'C++',
 'Clojure',
 'Cobol',
 'Crystal',
 'Dart',
 'Delphi',
 'Elixir',
 'Erlang',
 'F#',
 'Fortran',
 'GDScript',
 'Go',
 'Groovy',
 'HTML/CSS',
 'Haskell',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'Lisp',
 'Lua',
 'MATLAB',
 'MicroPython',
 'Nim',
 'OCaml',
 'Objective-C',
 'PHP',
 'Perl',
 'PowerShell',
 'Prolog',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'SQL',
 'Scala',
 'Solidity',
 'Swift',
 'TypeScript',
 'VBA',
 'Visual Basic (.Net)',
 'Zephyr',
 'Zig',
 'Amazon Q',
 'Andi',
 'AskCodi',
 'Bing AI',
 'ChatGPT',
 'Claude',
 'Codeium',
 'Cody',
 'GitHub Copilot',
 'Google Gemini',
 'Lightning AI',
 'Meta AI',
 'Metaphor',
 'Neeva AI',
 'OpenAI Codex',
 'Perplexity AI',
 'Phind',
 'Quora Poe',
 'Replit Ghostwriter',
 'Snyk Code',
 'Tabnine',
 'Visual Studio Intellicode',
 'Whispr AI',
 'WolframAlpha',
 'You.com',
 'Employ

In [10]:
encoded_data = pd.get_dummies(
    employment_encoded,
    columns=[
        'MainBranch',
        'Age',
        'DevType',
        'EdLevel',
    ]
)

list(encoded_data.columns)

['YearsCode',
 'YearsCodePro',
 'AISelect',
 'Ada',
 'Apex',
 'Assembly',
 'Bash/Shell (all shells)',
 'C',
 'C#',
 'C++',
 'Clojure',
 'Cobol',
 'Crystal',
 'Dart',
 'Delphi',
 'Elixir',
 'Erlang',
 'F#',
 'Fortran',
 'GDScript',
 'Go',
 'Groovy',
 'HTML/CSS',
 'Haskell',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'Lisp',
 'Lua',
 'MATLAB',
 'MicroPython',
 'Nim',
 'OCaml',
 'Objective-C',
 'PHP',
 'Perl',
 'PowerShell',
 'Prolog',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'SQL',
 'Scala',
 'Solidity',
 'Swift',
 'TypeScript',
 'VBA',
 'Visual Basic (.Net)',
 'Zephyr',
 'Zig',
 'Amazon Q',
 'Andi',
 'AskCodi',
 'Bing AI',
 'ChatGPT',
 'Claude',
 'Codeium',
 'Cody',
 'GitHub Copilot',
 'Google Gemini',
 'Lightning AI',
 'Meta AI',
 'Metaphor',
 'Neeva AI',
 'OpenAI Codex',
 'Perplexity AI',
 'Phind',
 'Quora Poe',
 'Replit Ghostwriter',
 'Snyk Code',
 'Tabnine',
 'Visual Studio Intellicode',
 'Whispr AI',
 'WolframAlpha',
 'You.com',
 'Employed, full-time',
 'Employed, part-time',
 'I pre

In [11]:
def clean_years(year):
    if year == 'Less than 1 year' or pd.isnull(year):
        return np.int64(0)
    elif year == 'More than 50 years':
        return np.int64(51)
    else:
        return np.int64(year)

# langs_encoded['YearsCode'].unique()
encoded_data['YearsCode'] = encoded_data['YearsCode'].apply(clean_years)
encoded_data['YearsCodePro'] = encoded_data['YearsCodePro'].apply(clean_years)

In [12]:
encoded_data['AISelect'] = encoded_data['AISelect'].map(
    {
        'Yes': True,
        'No, but I plan to soon': False,
        'No, and I don\'t plan to': False,
    }
)

In [13]:
encoded_data = encoded_data.dropna()
# list(encoded_data.columns)
obj_cols = list(encoded_data.select_dtypes(object).columns)
encoded_data[obj_cols] = encoded_data[obj_cols].astype(bool)

In [14]:
pred_cols = list(encoded_data.columns)
pred_cols.remove(TARGET_COL)
y = encoded_data[[TARGET_COL]]
X = encoded_data[pred_cols]

X

Unnamed: 0,YearsCode,YearsCodePro,Ada,Apex,Assembly,Bash/Shell (all shells),C,C#,C++,Clojure,...,DevType_Student,DevType_System administrator,"EdLevel_Associate degree (A.A., A.S., etc.)","EdLevel_Bachelor’s degree (B.A., B.S., B.Eng., etc.)","EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",EdLevel_Primary/elementary school,"EdLevel_Professional degree (JD, MD, Ph.D, Ed.D, etc.)","EdLevel_Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",EdLevel_Some college/university study without earning a degree,EdLevel_Something else
1,19,8,False,False,False,True,True,False,True,False,...,False,False,False,False,True,False,False,False,False,False
2,13,4,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
3,12,8,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
5,7,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
7,16,13,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30023,10,3,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
30024,5,1,False,False,True,True,True,False,True,True,...,False,False,False,True,False,False,False,False,False,False
30025,3,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
30029,4,4,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=10)

In [16]:
clf = LinearSVC()
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [17]:
clf.score(X_test, y_test)

0.7758069206164583

In [22]:
sample = {key: False for key in list(encoded_data.select_dtypes(bool).columns)}
sample['YearsCode'] = 0
sample['YearsCodePro'] = 0
sample.pop('AISelect')
prediction = clf.predict([list(sample.values())])
prediction



array([ True])