In [11]:
import pandas as pd
import numpy as np

# One record per candidate: (CandidateID, Skills, Years_Exp, Status).
candidate_records = [
    (101, "Python, SQL", 5, "Review"),      # lists Python
    (102, "Java, C++", 10, "Reject"),       # no Python
    (103, np.nan, 2, "Reject"),             # Skills is missing
    (104, "python expert", 4, "Review"),    # lists Python (lowercase)
    (105, "Excel, Word", 1, "Reject"),      # no Python
    (106, "PYTHON, GO", 3, "Hired"),        # lists Python (uppercase)
    (107, "Python", np.nan, "Review"),      # lists Python, Years_Exp is missing
]

# Transpose the records into one list per column.
candidate_ids, skills, years_exp, statuses = map(list, zip(*candidate_records))

data = {
    "CandidateID": candidate_ids,
    "Skills": skills,
    "Years_Exp": years_exp,
    "Status": statuses,
}

df = pd.DataFrame(data)
df


Unnamed: 0,CandidateID,Skills,Years_Exp,Status
0,101,"Python, SQL",5.0,Review
1,102,"Java, C++",10.0,Reject
2,103,,2.0,Reject
3,104,python expert,4.0,Review
4,105,"Excel, Word",1.0,Reject
5,106,"PYTHON, GO",3.0,Hired
6,107,Python,,Review


In [12]:
# Preview only the candidates that have a Skills value. The filtered frame is
# displayed but not assigned, so `df` itself still holds the row with the
# missing Skills entry.
has_skills = df['Skills'].notna()
df[has_skills]

Unnamed: 0,CandidateID,Skills,Years_Exp,Status
0,101,"Python, SQL",5.0,Review
1,102,"Java, C++",10.0,Reject
3,104,python expert,4.0,Review
4,105,"Excel, Word",1.0,Reject
5,106,"PYTHON, GO",3.0,Hired
6,107,Python,,Review


In [18]:
# Treat missing experience as 0 years. fillna targets every NaN in the column
# instead of a hard-coded row label, so the cell keeps working if rows are
# added, dropped, or re-indexed, and it is safe to re-run.
df['Years_Exp'] = df['Years_Exp'].fillna(0)
df

Unnamed: 0,CandidateID,Skills,Years_Exp,Status
0,101,"Python, SQL",5.0,Review
1,102,"Java, C++",10.0,Reject
2,103,,2.0,Reject
3,104,python expert,4.0,Review
4,105,"Excel, Word",1.0,Reject
5,106,"PYTHON, GO",3.0,Hired
6,107,Python,0.0,Review


In [24]:
# Normalise the skill text to lowercase so later substring matching does not
# depend on case; a missing Skills entry stays NaN.
skills_lowercased = df['Skills'].str.lower()
df['Skills'] = skills_lowercased
df

Unnamed: 0,CandidateID,Skills,Years_Exp,Status
0,101,"python, sql",5.0,Review
1,102,"java, c++",10.0,Reject
2,103,,2.0,Reject
3,104,python expert,4.0,Review
4,105,"excel, word",1.0,Reject
5,106,"python, go",3.0,Hired
6,107,python,0.0,Review


In [27]:
# Boolean row mask: the candidate lists python AND has at least 3 years of
# experience. na=False makes a missing Skills value count as "no match".
has_python = df['Skills'].str.contains('python', na=False)
has_min_experience = df['Years_Exp'].ge(3)
qualified_candidates = has_python & has_min_experience
df.loc[qualified_candidates]

Unnamed: 0,CandidateID,Skills,Years_Exp,Status
0,101,"python, sql",5.0,Review
3,104,python expert,4.0,Review
5,106,"python, go",3.0,Hired


In [34]:
# Count how many qualified candidates fall under each Status value.
df['Status'][qualified_candidates].value_counts()

Status
Review    2
Hired     1
Name: count, dtype: int64

In [36]:
# Status tally for the qualified candidates, selecting the rows and the
# column in a single .loc call.
qualified_status = df.loc[qualified_candidates, 'Status']
qualified_status.value_counts()

Status
Review    2
Hired     1
Name: count, dtype: int64

In [37]:
import pandas as pd

# One record per review: (ReviewID, Text, Sentiment).
review_records = [
    (1, "Love it!", "Positive"),
    (2, "Worst product ever. Do not buy.", "Negative"),
    (3, "it is okay", "Neutral"),
    (4, "AMAZING performance and battery life", "Positive"),
    (5, "Trash.", "Negative"),
]

# Transpose the records into one list per column.
review_ids, texts, sentiments = map(list, zip(*review_records))

data = {
    "ReviewID": review_ids,
    "Text": texts,
    "Sentiment": sentiments,
}

df = pd.DataFrame(data)
df

Unnamed: 0,ReviewID,Text,Sentiment
0,1,Love it!,Positive
1,2,Worst product ever. Do not buy.,Negative
2,3,it is okay,Neutral
3,4,AMAZING performance and battery life,Positive
4,5,Trash.,Negative


In [39]:
# Encode each sentiment label as an integer with Series.map.
# A label that is absent from the mapping would come out as NaN.
text_to_number = {
    "Positive": 1,
    "Negative": 0,
    "Neutral": -1,
}
sentiment_labels = df['Sentiment']
df['Target'] = sentiment_labels.map(text_to_number)
df

Unnamed: 0,ReviewID,Text,Sentiment,Target
0,1,Love it!,Positive,1
1,2,Worst product ever. Do not buy.,Negative,0
2,3,it is okay,Neutral,-1
3,4,AMAZING performance and battery life,Positive,1
4,5,Trash.,Negative,0


In [41]:
# Character count of each review text. The vectorized .str accessor replaces
# apply(len): it avoids a Python-level call per row and yields NaN for a
# missing Text value instead of raising TypeError.
df['text_length'] = df['Text'].str.len()
df

Unnamed: 0,ReviewID,Text,Sentiment,Target,text_length
0,1,Love it!,Positive,1,8
1,2,Worst product ever. Do not buy.,Negative,0,31
2,3,it is okay,Neutral,-1,10
3,4,AMAZING performance and battery life,Positive,1,36
4,5,Trash.,Negative,0,6


In [47]:
# Flag reviews whose Target is 0 (the "Negative" label) and whose text is
# longer than 10 characters.
is_negative = df['Target'].eq(0)
is_long_text = df['text_length'].gt(10)
df['analyst_filt'] = is_negative & is_long_text
df

Unnamed: 0,ReviewID,Text,Sentiment,Target,text_length,analyst_filt
0,1,Love it!,Positive,1,8,False
1,2,Worst product ever. Do not buy.,Negative,0,31,True
2,3,it is okay,Neutral,-1,10,False
3,4,AMAZING performance and battery life,Positive,1,36,False
4,5,Trash.,Negative,0,6,False
