# Survey analysis
Analyse the results from the [languages survey](https://forms.gle/5b3mZRVcgAsoNG1FA)

In [1]:
all_langs = ['Python', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'C', 'C++', 'C#',
             'Ruby', 'R', 'Matlab', 'Go', 'Rust', 'Objective-C', 'Swift',
             'Visual Basic', 'Perl', 'Cobol', 'Fortran', 'Lisp', 'Assembly']

# Load data

In [2]:
from pprint import pprint
import pandas as pd

df = pd.read_csv('../data/Programming language survey.csv', header=0, names=('timestamp', 'languages', 'other_langs', 'age'), usecols=('languages', 'other_langs', 'age'))
    
df.head()

Unnamed: 0,languages,other_langs,age
0,Python;Java;C;C++;R;Matlab;Rust;Fortran;Assembly,"awk,Julia",50 - 59
1,Python;Java;PHP;Visual Basic,,40 - 49
2,Python;Java;JavaScript;C;C++;Ruby;R;Objective-...,,30 - 39
3,Python;Java;JavaScript;C;C++;C#;R;Perl,,50 - 59
4,JavaScript;C;C#;Objective-C;Visual Basic,,30 - 39


In [3]:
import re

for lang in all_langs:
    regex = rf'(^|;){re.escape(lang)}($|;)'
    df[lang] = df['languages'].str.contains(regex)
df

  df[lang] = df['languages'].str.contains(regex)


Unnamed: 0,languages,other_langs,age,Python,Java,JavaScript,TypeScript,PHP,C,C++,...,Go,Rust,Objective-C,Swift,Visual Basic,Perl,Cobol,Fortran,Lisp,Assembly
0,Python;Java;C;C++;R;Matlab;Rust;Fortran;Assembly,"awk,Julia",50 - 59,True,True,False,False,False,True,True,...,False,True,False,False,False,False,False,True,False,True
1,Python;Java;PHP;Visual Basic,,40 - 49,True,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
2,Python;Java;JavaScript;C;C++;Ruby;R;Objective-...,,30 - 39,True,True,True,False,False,True,True,...,False,False,True,False,False,False,False,False,False,True
3,Python;Java;JavaScript;C;C++;C#;R;Perl,,50 - 59,True,True,True,False,False,True,True,...,False,False,False,False,False,True,False,False,False,False
4,JavaScript;C;C#;Objective-C;Visual Basic,,30 - 39,False,False,True,False,False,True,False,...,False,False,True,False,True,False,False,False,False,False
5,Python;Java;Perl,SQL,50 - 59,True,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
6,Python;Java;C++,,40 - 49,True,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
7,Python;JavaScript;Go,SQL (arguably also considered a programming la...,30 - 39,True,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
8,Python;Matlab,,20 - 29,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,Python;Java;C;C#;Lisp,,50 - 59,True,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False


## Find number of languages known
Print `"{# known by class} / {# in list} languages known by this class (as %)"`.

E.g. **12/21 languages known by this class (57%)**

In [4]:
known_langs = df.columns[df.eq(True).any()].tolist()
known_langs

['Python',
 'Java',
 'JavaScript',
 'TypeScript',
 'PHP',
 'C',
 'C++',
 'C#',
 'Ruby',
 'R',
 'Matlab',
 'Go',
 'Rust',
 'Objective-C',
 'Swift',
 'Visual Basic',
 'Perl',
 'Cobol',
 'Fortran',
 'Lisp',
 'Assembly']

In [5]:
percent = round(len(known_langs)/len(all_langs) * 100)
print(f"{len(known_langs)}/{len(all_langs)} languages known by this class ({percent}%)")

21/21 languages known by this class (100%)


## List languages not known by anyone in the class

In [6]:
print('Not known:')
', '.join(df.columns[df.eq(False).all()].tolist())

Not known:


''

# Rank languages by most commonly known
Print each language as `"{position}: {language} ({count})"`, in order from most to least known

e.g. **1: Python (30)**

In [7]:
langs_count = df.iloc[:, 3:].sum()
langs_count

Python          47
Java            19
JavaScript      16
TypeScript       6
PHP              5
C               22
C++             19
C#               9
Ruby             5
R               12
Matlab           7
Go               5
Rust             3
Objective-C      2
Swift            1
Visual Basic    13
Perl            15
Cobol            4
Fortran          7
Lisp             5
Assembly         9
dtype: int64

In [8]:
langs_count.sort_values(ascending=False, inplace=True)
langs_count

Python          47
C               22
C++             19
Java            19
JavaScript      16
Perl            15
Visual Basic    13
R               12
Assembly         9
C#               9
Fortran          7
Matlab           7
TypeScript       6
Ruby             5
Go               5
PHP              5
Lisp             5
Cobol            4
Rust             3
Objective-C      2
Swift            1
dtype: int64

In [9]:
for i, (lang, count) in enumerate(langs_count.iteritems(), start=1):
    print(f'{i}: {lang} ({count})')

1: Python (47)
2: C (22)
3: C++ (19)
4: Java (19)
5: JavaScript (16)
6: Perl (15)
7: Visual Basic (13)
8: R (12)
9: Assembly (9)
10: C# (9)
11: Fortran (7)
12: Matlab (7)
13: TypeScript (6)
14: Ruby (5)
15: Go (5)
16: PHP (5)
17: Lisp (5)
18: Cobol (4)
19: Rust (3)
20: Objective-C (2)
21: Swift (1)


## Bonus: rank languages known by age group

In [10]:
age_ranges = ['<= 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '>= 60', 'Unknown']

df2 = df
df2['age'] = pd.Categorical(df['age'], categories=age_ranges)
df2['age'].fillna("Unknown", inplace=True)
df2['total'] = 1
df2

Unnamed: 0,languages,other_langs,age,Python,Java,JavaScript,TypeScript,PHP,C,C++,...,Rust,Objective-C,Swift,Visual Basic,Perl,Cobol,Fortran,Lisp,Assembly,total
0,Python;Java;C;C++;R;Matlab;Rust;Fortran;Assembly,"awk,Julia",50 - 59,True,True,False,False,False,True,True,...,True,False,False,False,False,False,True,False,True,1
1,Python;Java;PHP;Visual Basic,,40 - 49,True,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,1
2,Python;Java;JavaScript;C;C++;Ruby;R;Objective-...,,30 - 39,True,True,True,False,False,True,True,...,False,True,False,False,False,False,False,False,True,1
3,Python;Java;JavaScript;C;C++;C#;R;Perl,,50 - 59,True,True,True,False,False,True,True,...,False,False,False,False,True,False,False,False,False,1
4,JavaScript;C;C#;Objective-C;Visual Basic,,30 - 39,False,False,True,False,False,True,False,...,False,True,False,True,False,False,False,False,False,1
5,Python;Java;Perl,SQL,50 - 59,True,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1
6,Python;Java;C++,,40 - 49,True,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,1
7,Python;JavaScript;Go,SQL (arguably also considered a programming la...,30 - 39,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
8,Python;Matlab,,20 - 29,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
9,Python;Java;C;C#;Lisp,,50 - 59,True,True,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,1


In [11]:
by_age = df2.groupby('age').sum()
by_age

Unnamed: 0_level_0,Python,Java,JavaScript,TypeScript,PHP,C,C++,C#,Ruby,R,...,Rust,Objective-C,Swift,Visual Basic,Perl,Cobol,Fortran,Lisp,Assembly,total
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
<= 19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20 - 29,2,0,1,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,2
30 - 39,10,3,3,0,0,4,3,1,1,2,...,0,2,0,4,1,0,0,1,2,11
40 - 49,11,6,6,2,2,4,4,2,0,1,...,0,0,0,4,2,0,0,0,0,13
50 - 59,18,8,5,3,2,11,8,4,4,6,...,3,0,1,2,10,2,4,4,4,18
>= 60,6,2,1,0,1,3,3,2,0,2,...,0,0,0,3,2,2,3,0,3,6
Unknown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
for index in by_age.index:
    num_respondents = by_age.loc[index, 'total']
    print(f"For {index} ({num_respondents})")
    if num_respondents == 0:
        print('  No data')
    else:
        print(by_age.loc[index, by_age.loc[index] >= 1].sort_values(ascending=False).to_string())
    print()

For <= 19 (0)
  No data

For 20 - 29 (2)
Python        2
total         2
JavaScript    1
TypeScript    1
C++           1
R             1
Matlab        1

For 30 - 39 (11)
total           11
Python          10
C                4
Visual Basic     4
Java             3
JavaScript       3
C++              3
R                2
Go               2
Objective-C      2
Assembly         2
C#               1
Ruby             1
Matlab           1
Perl             1
Lisp             1

For 40 - 49 (13)
total           13
Python          11
Java             6
JavaScript       6
C                4
C++              4
Visual Basic     4
TypeScript       2
PHP              2
C#               2
Perl             2
R                1

For 50 - 59 (18)
Python          18
total           18
C               11
Perl            10
C++              8
Java             8
R                6
JavaScript       5
Matlab           5
Ruby             4
C#               4
Fortran          4
Lisp             4
Assembly      

## Bonus: other languages known

In [13]:
df3 = df.loc[:,['other_langs', 'age']]
df3.index = df3.pop('age')
df3

Unnamed: 0_level_0,other_langs
age,Unnamed: 1_level_1
50 - 59,"awk,Julia"
40 - 49,
30 - 39,
50 - 59,
30 - 39,
50 - 59,SQL
40 - 49,
30 - 39,SQL (arguably also considered a programming la...
20 - 29,
50 - 59,


In [14]:
df3['other_langs'] = df3['other_langs'].str.split(',')
df3.dropna(axis=0, inplace=True)
other_langs = df3.explode('other_langs')
other_langs

Unnamed: 0_level_0,other_langs
age,Unnamed: 1_level_1
50 - 59,awk
50 - 59,Julia
50 - 59,SQL
30 - 39,SQL (arguably also considered a programming la...
30 - 39,SQL
30 - 39,PL/SQL
>= 60,Focus
>= 60,Pascal
>= 60,Object Pascal
40 - 49,C


In [15]:
other_langs.value_counts()

other_langs                                          
SQL                                                      3
Pascal                                                   2
Julia                                                    2
 C++                                                     1
Include                                                  1
awk                                                      1
SQL (arguably also considered a programming language)    1
PowerShell                                               1
Pascal  APL                                              1
Fortran                                                  1
 Delphi                                                  1
Focus                                                    1
C                                                        1
Angular                                                  1
 Wolfram                                                 1
 SQL                                                     1
 P