In [1]:
import pandas as pd 
import re
import numpy as np

In [2]:
# Ordinal variables: categorical values that carry a natural order.
# One letter grade per row; the index labels each row with a coarse quality band.
# NOTE(review): the index label 'goog' looks like a typo for 'good'; it is kept
# as-is here so that the recorded outputs below still match.
df = pd.DataFrame(
    {'Grades': ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']},
    index=['excellent', 'excellent', 'excellent', 'goog', 'good', 'good',
           'ok', 'ok', 'ok', 'poor', 'poor'],
)
df

Unnamed: 0,Grades
excellent,A+
excellent,A
excellent,A-
goog,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [3]:
# Checking the dtype of this column shows plain `object`, because the values
# were supplied as strings.
df.dtypes

Grades    object
dtype: object

In [4]:
# We can tell pandas to treat the column as categorical with astype('category').
# The result is not assigned here, so df is left as it was; the categories come
# out unordered and lexically sorted (see the output below).
df['Grades'].astype('category')

excellent    A+
excellent     A
excellent    A-
goog         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [5]:
# The grades are not merely categorical, they are ordered: an A- ranks above a
# B+, and a B ranks below a B+. Spell that order out explicitly, lowest first.
my_categories = pd.CategoricalDtype(
    ['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
    ordered=True,
)
# Casting the column with this dtype yields an ordered categorical Series.
grades = df['Grades'].astype(my_categories)
grades

excellent    A+
excellent     A
excellent    A-
goog         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [6]:
# Comparing the raw string column against 'C' is lexicographical, which gives
# results we did not intend: C+, C-, D+ and D all compare greater than 'C'.
df[df['Grades']>'C']

Unnamed: 0,Grades
ok,C+
ok,C-
poor,D+
poor,D


In [7]:
# So a C+ is greater than a C, but C- and D certainly are not. If we instead
# broadcast the comparison over the Series whose dtype is the ordered
# categorical, the result follows the declared grade order.

grades[grades>'C']

excellent    A+
excellent     A
excellent    A-
goog         B+
good          B
good         B-
ok           C+
Name: Grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [8]:
# Build the same grades as a standalone Series, listed from best to worst, and
# make it categorical right at construction time.
ser_1 = pd.Series(
    ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],
    dtype='category',
)
ser_1

0     A+
1      A
2     A-
3     B+
4      B
5     B-
6     C+
7      C
8     C-
9     D+
10     D
dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [9]:
# as_ordered() marks the existing categories as ordered without rearranging
# them, so the order stays lexical ('A' < 'A+' < 'A-' < 'B' ...), as the output
# below shows. The result is not assigned, so ser_1 itself is not rebound.
ser_1.cat.as_ordered() # note: for this to be meaningful, the grades would have to be listed in decreasing order

0     A+
1      A
2     A-
3     B+
4      B
5     B-
6     C+
7      C
8     C-
9     D+
10     D
dtype: category
Categories (11, object): ['A' < 'A+' < 'A-' < 'B' ... 'C+' < 'C-' < 'D' < 'D+']

In [14]:
# The same grades again, this time listed from worst to best, created directly
# as a categorical Series.
ser_2 = pd.Series(
    ['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
    dtype='category',
)
ser_2

0      D
1     D+
2     C-
3      C
4     C+
5     B-
6      B
7     B+
8     A-
9      A
10    A+
dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [15]:
# Mark the categories as ordered and keep the result. The `inplace` keyword of
# as_ordered() was deprecated and later removed from pandas, so the result is
# reassigned to ser_2 instead of relying on in-place mutation.
# Note that the order is still the lexical category order
# ('A' < 'A+' < 'A-' < 'B' ...), not the grade order.
ser_2 = ser_2.cat.as_ordered()
ser_2

0      D
1     D+
2     C-
3      C
4     C+
5     B-
6      B
7     B+
8     A-
9      A
10    A+
dtype: category
Categories (11, object): ['A' < 'A+' < 'A-' < 'B' ... 'C+' < 'C-' < 'D' < 'D+']

In [10]:
# With ordinal variables we can use a certain set of mathematical operators,
# such as minimum and maximum, on the ordinal data.
# Load the ranking data from 'cwurData.csv' and preview the first rows.
df_Rank = pd.read_csv('cwurData.csv')
df_Rank.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [11]:
# Non-null values per column; broad_impact is the only column with missing
# entries (2000 of 2200 rows, see the output below).
df_Rank.count()

world_rank              2200
institution             2200
country                 2200
national_rank           2200
quality_of_education    2200
alumni_employment       2200
quality_of_faculty      2200
publications            2200
influence               2200
citations               2200
broad_impact            2000
patents                 2200
score                   2200
year                    2200
dtype: int64

In [12]:
def create_category(ranking):
    """Map a world ranking to a descriptive tier label.

    Parameters
    ----------
    ranking : number
        World rank of a single institution (a scalar, not a Series).

    Returns
    -------
    str
        'First tier top university' for 1-100,
        'Secund tier top university' for 101-200,
        'Third tier top university' for 201-300, and
        'Other top university' for any other value, including values that
        fall below 1, above 300, or between the listed ranges.
    """
    # Chained comparisons replace the bitwise `&` that was used as a logical
    # "and" on scalar booleans.
    # NOTE(review): 'Secund' looks like a typo for 'Second'; the label is kept
    # unchanged because other code may match on the exact string.
    if 1 <= ranking <= 100:
        return 'First tier top university'
    if 101 <= ranking <= 200:
        return 'Secund tier top university'
    if 201 <= ranking <= 300:
        return 'Third tier top university'
    return 'Other top university'

# Label every row with its tier; the function is handed to apply() directly,
# which calls it once per world_rank value.
df_Rank['Rank Level'] = df_Rank['world_rank'].apply(create_category)
df_Rank.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year,Rank Level
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012,First tier top university
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012,First tier top university
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012,First tier top university
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012,First tier top university
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012,First tier top university


In [13]:
# Report the frame's dimensions; shape unpacks to (rows, columns).
print('There are {} rows and {} columns'.format(*df_Rank.shape))

There are 2200 rows and 15 columns
