## Exploratory Data Analysis

### Import Libraries

In [1]:
import sys
import polars as pl
import polars.selectors as pol_sel

### Show Python & Library Versions

In [2]:
# Column widths used to align the "name : version" report below.
label_width = 8
value_width = 12

# sys.version starts with the version number followed by build details,
# separated by whitespace. Taking the first token keeps the whole number;
# a fixed six-character slice would cut "3.11.10" down to "3.11.1".
python_version = sys.version.split()[0]

print("Python".rjust(label_width), ":", python_version.ljust(value_width))
print("Polars".rjust(label_width), ":", pl.__version__.ljust(value_width))

  Python : 3.11.4      
  Polars : 1.12.0      


### Load CSV File into Polars DataFrame

In [3]:
# Read the CSV at the relative path "data/dataset.csv" into a Polars DataFrame.
# No schema is passed, so the column dtypes are the ones Polars infers.
df = pl.read_csv("data/dataset.csv")

# Bare last expression: the notebook renders the frame as the cell output.
df

id,Name,Position,Department,University,Location,Profile URL,Expertise,Experience,Qualification,Honours and Awards,Highest Qualification,Has Awards,Start Year,Years of Experience
i64,str,str,str,str,str,str,str,str,str,str,str,i64,f64,f64
556330,"""Ms Dixita Kagathara""","""Assistant Professor""","""Department of Computer Science…","""Darshan University""","""Gujarat""","""https://darshan.irins.org/prof…","""Computer Science Software Engi…","""2021 - …","""2013; MTech; Gujarat Universit…",,"""MTech""",0,2021.0,3.0
556331,"""Mr Firoz Sherasiya""","""Assistant Professor""","""Department of Computer Science…","""Darshan University""","""Gujarat""","""https://darshan.irins.org/prof…","""Computer Science Software Engi…","""2022 - …","""2015; MTech; Gujarat Technolog…",,"""MTech""",0,2022.0,2.0
556358,"""Mr Bhavin Kanani""","""Assistant Professor""","""Department of Electrical Engin…","""Darshan University""","""Gujarat""","""https://darshan.irins.org/prof…","""Electrical and Electronic Engi…","""2021 - …","""2008; PhD; Gujarat Technologic…",,"""PhD""",0,2021.0,3.0
556374,"""Ms Vrunda Amrutiya""","""Assistant Professor""","""Department of Humanities and S…","""Darshan University""","""Gujarat""","""https://darshan.irins.org/prof…","""Humanities, Multidisciplinary""","""2024 - …","""2023; MSc; Saurashtra Universi…",,"""MSc""",0,2024.0,0.0
556357,"""Mr Raj Mehta""","""Assistant Professor""","""Department of Electrical Engin…","""Darshan University""","""Gujarat""","""https://darshan.irins.org/prof…","""Electrical and Electronic Engi…","""2021 - …","""2017; MTech; V.I.T. University…",,"""MTech""",0,2021.0,3.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
204095,"""Prof Shib Shankar Banerjee""","""Assistant Professor""","""Department of Materials Scienc…","""Indian Institute of Technology…","""Delhi""","""https://iitd.irins.org/profile…","""Materials Science Multidiscipl…","""2020 - …","""2016; Ph.D; Indian Institute o…",,"""Ph.D""",0,2020.0,4.0
202739,"""Mrs Aparna R""","""Assistant Professor""","""Department of Computer Science…","""M S Ramaiah Institute of Techn…","""Karnataka""","""https://msrit.irins.org/profil…","""Computer Science Theory and Me…","""2015 - …","""2007; M.Tech""",,"""M.Tech""",0,2015.0,9.0
170536,"""Dr Abhishek Majumder""","""Associate Professor""",,"""Tripura University""","""Tripura""","""https://tripurauniv.irins.org/…","""Computer Science Information S…","""2023 - …","""2018; Ph.D; Assam University""","""['2006; GATE scholarchip; AICT…","""Ph.D""",1,2023.0,1.0
206814,"""Dr Jaya Kishor Seth""","""Assistant Professor (Grade-II)""","""Department of Zoology""","""Berhampur University""","""Odisha""","""https://buodisha.irins.org/pro…","""Zoology""","""2022 - …","""2020; Ph.D.; Berhampur Univers…","""['2024; Gold and Elite; NPTEL …","""Ph.D""",1,2022.0,2.0


### Display First Few Rows to Understand Structure of Data

In [4]:
# Print the first rows as a plain-text table; head() is called without an
# argument and the output below reports five rows.
print(df.head())

shape: (5, 15)
┌────────┬─────────────┬─────────────┬─────────────┬───┬────────────┬────────┬────────┬────────────┐
│ id     ┆ Name        ┆ Position    ┆ Department  ┆ … ┆ Highest    ┆ Has    ┆ Start  ┆ Years of   │
│ ---    ┆ ---         ┆ ---         ┆ ---         ┆   ┆ Qualificat ┆ Awards ┆ Year   ┆ Experience │
│ i64    ┆ str         ┆ str         ┆ str         ┆   ┆ ion        ┆ ---    ┆ ---    ┆ ---        │
│        ┆             ┆             ┆             ┆   ┆ ---        ┆ i64    ┆ f64    ┆ f64        │
│        ┆             ┆             ┆             ┆   ┆ str        ┆        ┆        ┆            │
╞════════╪═════════════╪═════════════╪═════════════╪═══╪════════════╪════════╪════════╪════════════╡
│ 556330 ┆ Ms Dixita   ┆ Assistant   ┆ Department  ┆ … ┆ MTech      ┆ 0      ┆ 2021.0 ┆ 3.0        │
│        ┆ Kagathara   ┆ Professor   ┆ of Computer ┆   ┆            ┆        ┆        ┆            │
│        ┆             ┆             ┆ Science…    ┆   ┆            ┆       

### Retrieve Basic Information About DataFrame

In [5]:
# (rows, columns) on the first line, the list of column dtypes on the second.
print(df.shape, df.dtypes, sep="\n")

(15500, 15)
[Int64, String, String, String, String, String, String, String, String, String, String, String, Int64, Float64, Float64]


### Display Summary Statistics for All Columns

In [6]:
# describe() returns a frame with one row per statistic (count, null_count, ...)
# and one column per input column; it is kept in `summary` and printed as text.
summary = df.describe()
print(summary)

shape: (9, 16)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ id        ┆ Name      ┆ Position  ┆ … ┆ Highest   ┆ Has       ┆ Start     ┆ Years of │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ Qualifica ┆ Awards    ┆ Year      ┆ Experien │
│ str       ┆ f64       ┆ str       ┆ str       ┆   ┆ tion      ┆ ---       ┆ ---       ┆ ce       │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 15500.0   ┆ 15500     ┆ 15500     ┆ … ┆ 15500     ┆ 15500.0   ┆ 15197.0   ┆ 15197.0  │
│ null_coun ┆ 0.0       ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0.0       ┆ 303.0     ┆ 303.0    │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           

### Find Longest Text Length in Each String Column

In [7]:
# Names of every string-typed column (a later cell reuses this list).
string_columns = df.select(pol_sel.string()).columns

# One-row frame holding, for each string column, the character count of its
# longest value. A single selector expression lets Polars handle all string
# columns in one query instead of one select + NumPy conversion per column.
df_max_lengths = df.select(pol_sel.string().str.len_chars().max())

# Plain {column name: max length} mapping; empty when there are no string
# columns, in which case the frame above has no row to read.
max_lengths = df_max_lengths.row(0, named=True) if string_columns else {}

df_max_lengths

Name,Position,Department,University,Location,Profile URL,Expertise,Experience,Qualification,Honours and Awards,Highest Qualification
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
196,50,109,129,27,57,168,2584,4632,15942,9


### Retrieve Data Types of All Columns

In [8]:
# Prints the same dtype list as print(df.dtypes) in the earlier cell, with a
# label. print() separates its two arguments with a space, which is why the
# list on the second output line starts with a leading space.
print("Column data types:\n", df.dtypes)

Column data types:
 [Int64, String, String, String, String, String, String, String, String, String, String, String, Int64, Float64, Float64]


### Count Unique Values in Each Column

In [9]:
# Every column name in frame order (a later cell reuses this list).
all_columns = list(df.columns)

for column_name in all_columns:
    # Right-align the label to 48 characters and left-align the count to 6 so
    # the colons line up in the printed report.
    label = f"Unique values in {column_name} :"
    distinct_count = df.get_column(column_name).n_unique()
    print(label.rjust(48), str(distinct_count).ljust(6))

                           Unique values in id : 15500 
                         Unique values in Name : 14995 
                     Unique values in Position : 189   
                   Unique values in Department : 2231  
                   Unique values in University : 1166  
                     Unique values in Location : 109   
                  Unique values in Profile URL : 15500 
                    Unique values in Expertise : 442   
                   Unique values in Experience : 14185 
                Unique values in Qualification : 11755 
           Unique values in Honours and Awards : 7071  
        Unique values in Highest Qualification : 10    
                   Unique values in Has Awards : 2     
                   Unique values in Start Year : 54    
          Unique values in Years of Experience : 54    


### Check Distribution of Numerical Columns

In [10]:
# Columns with a numeric dtype, selected by dtype rather than by "not a string"
# so boolean or date columns could never be picked up by mistake. `id` is left
# out: it is an identifier (as many distinct values as rows), not a measurement.
numerical_cols = [
    name for name in df.select(pol_sel.numeric()).columns if name != "id"
]

# Print the column name followed by its describe() table, with blank lines
# between consecutive columns.
for col in numerical_cols:
    distribution = df.select(col).describe()
    print(col)
    print(distribution, '\n\n')

Has Awards
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ Has Awards │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 15500.0    │
│ null_count ┆ 0.0        │
│ mean       ┆ 0.457032   │
│ std        ┆ 0.498166   │
│ min        ┆ 0.0        │
│ 25%        ┆ 0.0        │
│ 50%        ┆ 0.0        │
│ 75%        ┆ 1.0        │
│ max        ┆ 1.0        │
└────────────┴────────────┘ 


Start Year
shape: (9, 2)
┌────────────┬─────────────┐
│ statistic  ┆ Start Year  │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ count      ┆ 15197.0     │
│ null_count ┆ 303.0       │
│ mean       ┆ 2012.029874 │
│ std        ┆ 9.580049    │
│ min        ┆ 1969.0      │
│ 25%        ┆ 2007.0      │
│ 50%        ┆ 2014.0      │
│ 75%        ┆ 2020.0      │
│ max        ┆ 2024.0      │
└────────────┴─────────────┘ 


Years of Experience
shape: (9, 2)
┌────────────┬─────────────────────┐
│ statistic  ┆ Yea

### Retrieve Unique List of Values for Select Features/Columns

In [14]:
# Columns whose distinct values are listed in full.
cols_2_check = ["Highest Qualification", "Has Awards", "Start Year", "Years of Experience"]

for column_name in cols_2_check:
    # Distinct values in sorted order, converted to a plain Python list.
    distinct_sorted = df.get_column(column_name).unique().sort().to_list()
    header = f"Column: {column_name} [{len(distinct_sorted)}]"
    print(header)
    print(f"Unique values: {distinct_sorted}\n")

Column: Highest Qualification [10]
Unique values: ['B.Tech', 'BTech', 'Doctorate', 'M.Sc', 'M.Tech', 'MSc', 'MTech', 'Other', 'Ph.D', 'PhD']

Column: Has Awards [2]
Unique values: [0, 1]

Column: Start Year [54]
Unique values: [None, 1969.0, 1971.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0, 2018.0, 2019.0, 2020.0, 2021.0, 2022.0, 2023.0, 2024.0]

Column: Years of Experience [54]
Unique values: [None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49