In [1]:
import pandas as pd
import numpy as np

In [26]:
filename = "/Users/ahmedabukar/Downloads/:/data/so_2021_survey_results.csv"
df = pd.read_csv(filename,
                usecols=["LanguageHaveWorkedWith",
                        "LanguageWantToWorkWith",
                        "Country", "CompTotal"])
df.shape

(83439, 4)

In [3]:
# what are the different programming langauges that developers currently use?
def explode_strings(s):
    return (
        s.str.split(";")
        .explode()
    )

(
    df
    ["LanguageHaveWorkedWith"]
    .pipe(explode_strings)
    .drop_duplicates()
)

0               C++
0          HTML/CSS
0        JavaScript
0       Objective-C
0               PHP
0             Swift
1            Python
2          Assembly
2                 C
2                 R
2              Rust
3        TypeScript
4        Bash/Shell
4               SQL
5                C#
5              Java
5           Node.js
5        PowerShell
7              Ruby
12             Perl
19           Matlab
24           Kotlin
25            Julia
28          Haskell
29           Delphi
31               Go
31            Scala
38             Dart
46              NaN
50              VBA
85           Groovy
88          Clojure
133             APL
133            LISP
143              F#
371          Elixir
439          Erlang
522         Crystal
8054          COBOL
Name: LanguageHaveWorkedWith, dtype: object

In [4]:
# what are the 10 the programming languages most commonly used?
top_10_have_work_with = (
    df["LanguageHaveWorkedWith"]
    .pipe(explode_strings)
    .value_counts()
    .head(10)
)

top_10_have_work_with

LanguageHaveWorkedWith
JavaScript    53587
HTML/CSS      46259
Python        39792
SQL           38835
Java          29162
Node.js       27975
TypeScript    24909
C#            22984
Bash/Shell    22385
C++           20057
Name: count, dtype: int64

In [5]:
# what are the 10 programming languages people most want to use?
top_10_want_to_work = (
    df["LanguageWantToWorkWith"]
    .pipe(explode_strings)
    .value_counts()
    .head(10)
)

top_10_want_to_work

LanguageWantToWorkWith
JavaScript    37008
Python        34929
HTML/CSS      29353
TypeScript    26905
SQL           26631
Node.js       24100
C#            17999
Java          17222
Rust          15865
Go            15788
Name: count, dtype: int64

In [6]:
# what languages are on both top-10 lists?
top_10_want_to_work.index.intersection(top_10_have_work_with.index)

Index(['JavaScript', 'Python', 'HTML/CSS', 'TypeScript', 'SQL', 'Node.js',
       'C#', 'Java'],
      dtype='object')

In [7]:
# what languages in the top 10 have people worked with but don't want to work with in the future?
top_10_have_work_with.index.difference(top_10_want_to_work.index)

Index(['Bash/Shell', 'C++'], dtype='object')

In [20]:
# what is the most popular(current) language used by people in each country?
(
    df
    ["LanguageHaveWorkedWith"]
    .pipe(explode_strings)
    .groupby(df["Country"])
    .agg(pd.Series.mode)
)

Country
Afghanistan                                         JavaScript
Albania                                             JavaScript
Algeria                                             JavaScript
Andorra                                 [HTML/CSS, JavaScript]
Angola                                  [HTML/CSS, JavaScript]
                                                 ...          
Venezuela, Bolivarian Republic of...                JavaScript
Viet Nam                                            JavaScript
Yemen                                                      PHP
Zambia                                              JavaScript
Zimbabwe                                            JavaScript
Name: LanguageHaveWorkedWith, Length: 170, dtype: object

In [19]:
# what is th mean number of languages used in the last year?
languages = pd.read_csv("/Users/ahmedabukar/Downloads/languages.csv",
                       usecols=["title"])

(
    df
    ["LanguageHaveWorkedWith"]
    .str.split(";")
    .str.len()
    .mean()
)

5.307487409065473

In [10]:
# what is the greatest number of languages people listed as having used in the last year?
(
    df
    ["LanguageHaveWorkedWith"]
    .str.split(";")
    .str.len()
    .max()
    
)

38.0

In [24]:
# how many people chose that largest number?
(
    df
    .loc[df
    ["LanguageHaveWorkedWith"]
    .str.split(";")
    .str.len() == 38,
    "LanguageHaveWorkedWith"]
    .count()
)

5

In [12]:
# how many people in the survey claim salaries of $2 million or more?
(
    df
    [df["CompTotal"] >= 2000000]
    ["CompTotal"]
    .count()
)

2369

In [13]:
# remove rows in which salaries are less $2 million ?
df = df.loc[df["CompTotal"] < 2000000]

In [14]:
# turn the LangugaeWorkedWith column to 'dummy' columns in df
# such that each language is its own columns
languages_dummies = df["LanguageHaveWorkedWith"].str.get_dummies(sep=";")

In [15]:
# determine what combinations is best if you want to maximise your salary 
# and have to chose two programming languages from python, Javascript and java.
df = pd.concat([df, languages_dummies], axis="columns")

In [16]:
# what is the salary of someone who knowns python and javascript?

df['CompTotal'][(df['Python'] == 1) &
                (df['JavaScript'] == 1) &
                (df['Java'] == 0)].mean()

126817.99470235605

In [17]:
# salary for python and Java without JavaScript?
df["CompTotal"][(df["Python"] == 1) &
(df["Java"] == 1) &
(df["JavaScript"] == 0)].mean()

162737.10379596677

In [18]:
# salary for Java and JavaScript without Python?
df["CompTotal"][(df["Python"] == 0) &
(df["Java"] == 1) &
(df["JavaScript"] == 1)].mean()

140867.65981559738

In [44]:
# load the data with NEWStuch column?
df = pd.read_csv(filename,
                usecols=["NEWStuck", "Gender", "YearsCodePro", "YearsCode"])
df.head()

Unnamed: 0,YearsCode,YearsCodePro,NEWStuck,Gender
0,,,Call a coworker or friend;Visit Stack Overflow...,Man
1,7.0,,Visit Stack Overflow;Google it,Man
2,,,Visit Stack Overflow;Google it;Watch help / tu...,Man
3,,,Call a coworker or friend;Visit Stack Overflow...,Man
4,17.0,10.0,Visit Stack Overflow;Go for a walk or other ph...,Man


In [32]:
# when developers are stuck what are the three things that they likely to do?
(
    df["NEWStuck"]
    .str.split(";")
    .explode()
    .value_counts()
    .head(3)
)

NEWStuck
Google it                            74491
Visit Stack Overflow                 66410
Do other work and come back later    39871
Name: count, dtype: int64

In [40]:
# what proportion of the survey respondonts marked their gender as Man?
# Does that proportion seem similar to your real-life experience?
(
    df["Gender"]
    .str.split(";")
    .explode()
    .value_counts(normalize=True)
)

Gender
Man                                                  0.906957
Woman                                                0.052570
Prefer not to say                                    0.017339
Non-binary, genderqueer, or gender non-conforming    0.014044
Or, in your own words:                               0.009090
Name: proportion, dtype: float64

In [52]:
# on average, what proportion of their years coding have been done proffesionally?
df.loc[df["YearsCode"] == "Less than 1 year", "YearsCode"] = 0
df.loc[df["YearsCode"] == "More than 50 years", "YearsCode"] = 51

df.loc[df["YearsCodePro"] == "Less than 1 year", "YearsCodePro"] = 0
df.loc[df["YearsCodePro"] == "More than 50 years", "YearsCodePro"] = 51

# drop Nan
df = df.dropna(subset=["YearsCode", "YearsCodePro"])

# turn into integers
df["YearsCode"] = df["YearsCode"].astype(np.int16)
df["YearsCodePro"] = df["YearsCodePro"].astype(np.int16)
df = df[df["YearsCode"] != 0]

(df["YearsCode"] / df["YearsCodePro"]).mean()

2.0714732301931247

In [54]:
df[df["YearsCode"] != 0]

Unnamed: 0,YearsCode,YearsCodePro,NEWStuck,Gender
4,17,10,Visit Stack Overflow;Go for a walk or other ph...,Man
8,6,4,Visit Stack Overflow;Google it;Panic,Man
9,7,4,Call a coworker or friend;Visit Stack Overflow...,Man
10,16,10,Visit Stack Overflow;Google it,Man
11,12,5,Call a coworker or friend;Visit Stack Overflow...,Man
...,...,...,...,...
83434,6,5,Call a coworker or friend;Google it,Man
83435,4,2,Call a coworker or friend;Visit Stack Overflow...,Man
83436,10,4,Call a coworker or friend;Visit Stack Overflow...,Man
83437,5,3,Call a coworker or friend;Visit Stack Overflow...,Man
