In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  1 10:23:07 2021

@author: Jinqi Zhang
"""

#* source_code.py
#*
#* ANLY 555 Spring 2021
#* Project 2
#*
#* Due on: 3/8/2021
#* Authors: Jinqi Zhang, Jingyi Wang, Jinglin Liang, Xueyan Liu
#*
#*
#* In accordance with the class policies and Georgetown's
#* Honor Code, I certify that, with the exception of the
#* class resources and those items noted below, I have neither
#* given nor received any assistance on this project other than
#* the TAs, professor, textbook and teammates.
#*
#* References not otherwise commented within the program source code.
#* Note that you should not mention any help from the TAs, the professor,
#* or any code taken from the class textbooks.
#*

#=====================================================================
# Superclass: DataSet
#
# Subclass: 
# TimeSeriesDataSet
# TextDataSet
# QuantDataSet
# QualDataSet
#=====================================================================


import nltk
import regex
import string
import unicodedata
nltk.download('stopwords')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk import WordNetLemmatizer
from nltk import word_tokenize
from nltk import PorterStemmer
from nltk.draw.dispersion import dispersion_plot
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np
pd.set_option('mode.chained_assignment', None)




# Super Class: **DataSet**
Public Member Functions
```
def 	__init__ (self, filename)
def 	clean (self)
def 	explore (self)
def 	head (self)
def 	colname (self)
def 	colnum (self)
def 	rownum (self)
def 	__getitem__ (self, index)
def 	getcol (self, col)
```

In [None]:
class DataSet:
    """Base class: remembers a CSV path, loads it into ``self.df`` and prints
    usage guidance pointing at the type-specific subclasses.

    Attributes:
    _filename: path handed to the constructor.
    _type: dataset type typed in by the user in ``_load`` (None until then).
    df: pandas dataframe, only available after ``_readFromCSV`` has run.
    """

    def __init__(self, filename):
        """Constructor for super class Dataset"""
        self._filename = filename
        # Initialised here so that clean()/explore() raise the intended
        # ValueError("Unavailable type.") rather than an AttributeError when
        # _load() has not been called yet.
        self._type = None

    def _readFromCSV(self):
        """Read the CSV file at ``self._filename`` into ``self.df``."""
        self.df = pd.read_csv(self._filename)

    def _load(self):
        """Ask the user for the dataset type and print the matching instructions.

        The answer is stored in ``self._type``.

        Raises:
        ValueError: if the answer is not one of
            qualitative / quantitative / timeseries / textdata.
        """
        self._type = input("""input your dataset type and get instruction of each type
                           \ntype of data set (quantitative/qualitative/timeseries/textdata):""")

        if self._type == "qualitative":
            print("""Call QualDataSet() for qualitative data.
                  \nIncluding
                  \nclean() -- fill in missing values with the mode
                  \nexplore() -- create hist or pie plots \nDefault: explore(plot_type_qual, column_qual)
                  \ntable() -- show the frequency of each category of a certain column \nDefault: table(column_table) \n
                  """)
        elif self._type == "quantitative":
            print("""Call QuantDataSet() for quantitative data.
                  \nIncluding
                  \nclean() -- fill in missing values with the mean
                  \nexplore() -- create hist or scatter plots \nDefault: explore(plot_type, column1, column2=None)
                  \nmean() -- get the mean of certain quantitative column \nDefault: mean(column_name)
                  \nmedian() -- get the median of certain quantitative column \nDefault: median(column_name_median)
                  \nmode() -- get the mode of certain quantitative column \nDefault: mode(column_name_mode)
                  \norder() -- present certain quantitative column by specific order \nDefault: order(column_name_order) \n
                  """)
        elif self._type == "timeseries":
            print("""Call TimeSeriesDataSet() for timeseries data.
                  \nIncluding
                  \nclean() -- run a median filter with optional parameters which determine the filter size and return the dataframe with new column of clean data \nDefault: clean(tsColnameList, filter_size=7)
                  \nexplore() -- create timeseries or lag value relationship plots \nDefault: explore(tsColnameList, date_colname=None, lag=1, vis_type="all")
                  \nget_dataframe() -- get a dataframe with certain columns \nDefault: get_dataframe(tsColnameList=None)
                  \nget_colname() -- get the column names
                  \ntransformation(), differencing(), movingaverage() -- methods to remove trend and seasonality \nDefault: transformation(tsColnameList, method="log", power=0.5) \ndifferencing(tsColnameList, differences=1) \nmovingaverage(tsColnameList, window=3) \n
                  """)
        elif self._type == "textdata":
            # The keyword is spelled `wordcloud` in TextDataSet.explore; the
            # help text must match or copy-pasting it raises a TypeError.
            print("""Call TextDataSet() for text data.
                  \nIncluding
                  \nclean() -- choose whether or not to remove stopwords, remove punctuations, remove diacritics, lowercase, stem and lemmatize and return a token list \nDefault: clean(colname, lower=True, remove_diacritics=True, remove_punct=True, remove_stop=True, stem_tokens=False, lemmatize_tokens=False)
                  \nexplore() -- create wordcloud or dispersion plots \nDefault: explore(token_list, wordcloud=True, dispersion=True)
                  \ntoken_list() -- change the nested list to a flat list \nDefault: token_list(nested_list)
                  \nwordfrequency() -- get the number of times each word appears \nDefault: wordfrequency(token_list, n, ascending=True) \n
                  """)
        else:
            raise ValueError("Unavailable type.")

    def clean(self):
        """Print which subclass ``clean`` to call for the type chosen in ``_load``.

        Raises:
        ValueError: if no (or an unknown) type has been chosen.
        """
        if self._type == "qualitative":
            print("please call: QualDataSet(filename).clean() \n")
        elif self._type == "quantitative":
            print("please call: QuantDataSet(filename).clean() \n")
        elif self._type == "timeseries":
            print("""please call: TimeSeriesDataSet(filename).clean() \nDefault: clean(tsColnameList, filter_size=7) \n""")
        elif self._type == "textdata":
            print("""please call: TextDataSet(filename).clean() \nDefault: clean(colname, lower=True, remove_diacritics=True, remove_punct=True, remove_stop=True, stem_tokens=False, lemmatize_tokens=False) \n""")
        else:
            raise ValueError("Unavailable type.")

    def explore(self):
        """Print which subclass ``explore`` to call for the type chosen in ``_load``.

        Raises:
        ValueError: if no (or an unknown) type has been chosen.
        """
        if self._type == "qualitative":
            print("""please call: QualDataSet(filename).explore() \nDefault: explore(plot_type_qual, column_qual) \n""")
        elif self._type == "quantitative":
            print("""please call: QuantDataSet(filename).explore() \nDefault: explore(plot_type, column1, column2=None) \n""")
        elif self._type == "timeseries":
            print("""please call: TimeSeriesDataSet(filename).explore() \nDefault: explore(tsColnameList, date_colname=None, lag=1, vis_type="all") \n""")
        elif self._type == "textdata":
            print("""please call: TextDataSet(filename).explore() \nDefault: explore(token_list, wordcloud=True, dispersion=True) \n""")
        else:
            raise ValueError("Unavailable type.")

    def head(self):
        """Print the first rows of the dataframe (``DataFrame.head()``)."""
        print(self.df.head())

    def colname(self):
        """Print the list of column names."""
        print(list(self.df))

    def colnum(self):
        """Print the number of columns."""
        print(self.df.shape[1])

    def rownum(self):
        """Print the number of rows."""
        print(self.df.shape[0])

    def __getitem__(self, index):
        """Print and return one cell of the dataframe.

        Parameters:
        index: a (row_label, column_label) pair, e.g. ``ds[0, "price"]``.

        Returns:
        The value stored at that row/column.
        """
        value = self.df.loc[index[0]][index[1]]
        # Printing is kept for existing notebook usage; the value is now also
        # returned so that `ds[row, col]` is usable in expressions.
        print(value)
        return value

    def getcol(self, col):
        """Print and return the column named ``col``."""
        column = self.df[col]
        print(column)
        return column

# Subclass: **TimeSeriesDataSet**

Public Member Functions
```
def 	__init__ (self, filename)
def 	get_dataframe (self, tsColnameList=None)
def 	get_colname (self)
def 	clean (self, tsColnameList, filter_size=7)
def 	explore (self, tsColnameList, date_colname=None, lag=1, vis_type="all")
def 	transformation (self, tsColnameList, method="log", power=0.5)
def 	differencing (self, tsColnameList, differences=1)
def 	movingaverage (self, tsColnameList, window=3)
```

In [None]:
# Subclass: TimeSeriesDataSet
class TimeSeriesDataSet(DataSet):
    """ 
    This is a subclass for DataSet for handling time series data. 
    """

    def __init__(self, filename):
        """ 
        The constructor for TimeSeriesDataSet subclass. 

        Stores the file name through the DataSet constructor and then loads
        the CSV, so ``self.df`` is available right after construction.
  
        Parameters: 
           filename (str): Path of the CSV file to load
        """

        super().__init__(filename) 
        # populate self.df by reading the CSV with the base-class loader
        DataSet._readFromCSV(self)  
        
    def get_dataframe(self,tsColnameList = None):
        """ 
        get_dataframe Method:
        Get dataframe with selected columns names or full columns
  
        Parameters: 
        tsColnameList (list): The list of columns names
      
        Returns: 
        dataframe: The dataframe with selected columns names
        """
        if tsColnameList!= None:
            # check arguments type and check if inputed colnames are correct.
            if isinstance(tsColnameList, list) and (set(tsColnameList).issubset(self.df.columns.to_list())):
                return (self.df[tsColnameList])
            # or raise error
            else:
                raise TypeError("""tsColnameList = {0} is invalid. The argument: tsColnameList only accepts a list of columns name.
                             \nExample: df.get_dataframe(["var1","var2"])""".format(tsColnameList))

        else:
            return (self.df)
        
    def get_colname(self):
        """ 
        get_colname Method:
        Get columns names of dataframe
  
        Returns: 
        list: The a list of column names
        """
        return list(self.df.columns.tolist())
    
    def clean(self,tsColnameList,filter_size=7):
        """ 
        clean Method:
        Clean the selected timeseries data by median filter algorithm with inputed filter size 
        Add cleaned data as new columns to orginal dataframe.
  
        Parameters: 
        tsColnameList (list): The list of columns names
        filter_size (int) : filter size of median filter (default = 7)
      
        Returns: 
        dataframe: The dataframe with cleaned value columns 
        """
        # check tsColnameList type and check if inputed colnames are correct.
        if isinstance(tsColnameList, list) and (set(tsColnameList).issubset(self.df.columns.to_list())) :
            # check filter_size type
            if isinstance(filter_size, int):
                new_column_list = []
                for i in tsColnameList:
                  # setup new cols name
                  new_column = str("cleaned_"+i)
                  self.df[new_column] = self.df[i]
                  # create list for storing new cols
                  new_column_list.append(new_column)
        
                  # get data of that col 
                  col_temp = self.df[i]
        
                  # update dats by median
                  for j in range(0, len(col_temp)-filter_size):
                    window_list = col_temp[j:j+filter_size]
                    self.df[new_column][j] = np.median(window_list)
                  # update edge data 
                  for k in range (len(col_temp)-filter_size+1,len(col_temp)):
                    window_list = col_temp[k:len(col_temp)]
                    self.df[new_column][k] = np.median(window_list)

                return (self.df[new_column_list])
            else:
                raise TypeError("""filter_size = {0} is invalid. The argument: filter_size only accepts positive integer. 
                                 \nExample: df.clean(["var1"],filter_size = 7)""".format(filter_size))
        
        else:
            raise TypeError("""tsColnameList = {0} is invalid. The argument: tsColnameList only accepts a list of columns name.
                             \nExample: df.clean(["var1"],filter_size = 7)""".format(tsColnameList))



    def explore(self,tsColnameList,date_colname=None,lag=1, vis_type = "all"):
        """
        explore Method: 
        Create 2 type visulizations (timesseries plot and lag plot) for time series data.
  
        Parameters: 
        tsColnameList (list): The list of columns names
        date_colname (str) : The name of date columns 
        lag (int) : The lag for lag plots, (default = 1)
        vis_type (str) : The type of visulization:  "all" for output both times series plots and lag plots;"ts" for only output timesseries plots;"lag" for only output lag plots.  (default = "all")
      
        """
        def ts_plot(): 
        # ts_plot function: create timesseries plot.
            plt.figure(figsize=(10,5))#figsize=(20,10)
            plot_list = []
            plt.grid()
            # add ts lines to one plot
            for i in tsColnameList:
              plot, = plt.plot(pd.to_datetime(self.df[date_colname]), self.df[i], marker='', linestyle='-')
              plot_list.append(plot)
            # setup labels 
            ylab = tsColnameList[0]
            plt.ylabel(ylab)
            plt.title("Time Series Plot of "+ylab)
            plt.xlabel(date_colname)
            #ylab = ' '.join('{},'.format(k) for k in tsColnameList)
            ylab = tsColnameList[0]
            plt.ylabel(ylab)
            plt.legend(plot_list,tsColnameList)
            plt.show()

        def lag_plot():
        # lag_plot function: create lag plot.
            
          # create a plot for each column
          for i in tsColnameList:
              colname = i
              value =  self.df[colname] #self.df[colname]
              length = len(value)
              plt.figure(figsize=(10,5))#figsize=(20,10)
              plt.grid()
              plt.plot(value[0:length-lag],value[lag:length] ,linestyle='', marker='o', markersize=0.9, color="purple")
              # set uo labels
              plt.title("Lag Plot of "+str(colname)+" ( lag = "+str(lag)+")")
              plt.xlabel(colname+r"$_{(t)}$")
              footnote = "(t+" +str(lag)+  ")"
              y_lab = colname+ r"$_{{{}}}$".format(footnote)
              plt.ylabel(y_lab)
              plt.show()
        
        # check vis_type inputed value
        if vis_type == "all":
          tsplot = True
          lagplot = True
        elif vis_type == "ts":
          tsplot = True
          lagplot = False
        elif vis_type  == "lag":
          tsplot = False
          lagplot = True 
        else:
          # if wrong raise errror
          raise ValueError("""vis_type = {0} is invalid. The available vis_type contains: 
                           \n   * "all" for output both times series plots and lag plots,
                           \n   * "ts" for only output timesseries plots.
                           \n   * "lag" for only output lag plots.
                             \nExample: df.explore(["var1],date_colname="Date",lag=2, vis_type = "ts")""".format(vis_type))
        
        # check tsColnameList type and check if inputed colnames are correct.                           
        if isinstance(tsColnameList, list) and (set(tsColnameList).issubset(self.df.columns.to_list())):
          if tsplot == True:
             # check if date_colname correct
            if isinstance(date_colname, str) and (date_colname in self.df.columns.to_list()):
              ts_plot()
            else:
              raise TypeError("""date_colname = {0} is invalid. The argument: lagdate_colname only accepts a column name string that stores the date infomation. 
                                 \nExample: df.explore(["var1],date_colname="Date",lag=2, vis_type = "ts")""".format(date_colname))
          else:
            pass

          if lagplot == True:
             # check if inputed lag vaule correct
            if isinstance(lag, int):
             lag_plot()
            else:
              raise TypeError("""lag = {0} is invalid. The argument: lag only accepts positive integer. 
                                 \nExample: df.explore(["var1],date_colname="Date",lag=2, vis_type = "ts")""".format(lag))
          else:
            pass

        else:
            raise TypeError("""tsColnameList = {0} is invalid. The argument: tsColnameList only accepts a list of columns name.
                             \nExample:  df.explore(["var1],date_colname="Date",lag=2, vis_type = "ts")""".format(tsColnameList))
    
        
    def transformation(self,tsColnameList,method="log",power=0.5):
        """ 
        transformation Method:
        Transform the selected timeseries data by log or power algorithm 
        Add transformed data as new columns to orginal dataframe.
  
        Parameters: 
        tsColnameList (list): The list of columns names
        method (str) : The method type: "log" or power
        power (int or float) : power for power algorithm  (default = 0.5)
      
        Returns: 
        dataframe: The dataframe of transformed value columns 
        """
        # check tsColnameList type and check if inputed colnames are correct.                           
        if isinstance(tsColnameList, list) and (set(tsColnameList).issubset(self.df.columns.to_list())) :
            if  method == "log":
              new_column_list = []
              for i in tsColnameList:
                # setup new cols name
                new_column = str("logged_"+i)
                self.df[new_column] = np.log(self.df[i])
                # create list for storing new cols
                new_column_list.append(new_column)              
              return (self.df[new_column_list])


            elif method == "power":
              if isinstance(power, (float, int)):
                new_column_list = []
                for i in tsColnameList:
                  # setup new cols name
                  new_column = str("powered_"+i)
                  self.df[new_column] = (self.df[i])**power
                  # create list for storing new cols
                  new_column_list.append(new_column)              
                return (self.df[new_column_list])
              else:
              # raise error  for wrong power value
                raise TypeError("""power = {0} is invalid. The argument: lag only accepts integer or float. 
                                 \nExample: df.transformation(["var1],method = "power",power=0.8)""".format(power))

            else:
              # raise error  for wrong method type
              raise ValueError("""method = {0} is invalid. The available method contains: 
                               \n   * "log" for applying log transformation,
                               \n   * "power" for applying power transformation.
                               \nExample:  df.transformation(["var1],method = "power",power=0.8)""".format(method))
        else:
        # raise error  for wrong tsColnameList
          raise TypeError("""tsColnameList = {0} is invalid. The argument: tsColnameList only accepts a list of columns name.
                             \nExample:  df.transformation(["var1],method = "power",power=0.8)""".format(tsColnameList))


    def differencing(self,tsColnameList,differences = 1):
        """ 
        differencing Method:
        Updating the selected timeseries data by differencing algorithm 
        Add differenced data as new columns to orginal dataframe.
  
        Parameters: 
        tsColnameList (list): The list of columns names
        differences (int) : differencing lag value  (default = 1)
      
        Returns: 
        dataframe: The dataframe of differenced value columns 
        """
        if isinstance(tsColnameList, list) and (set(tsColnameList).issubset(self.df.columns.to_list())) :
            if isinstance(differences, (int)):
              new_column_list = []
              for i in tsColnameList:
                # setup new cols name
                new_column = str("diffed_"+i)
                new_column_list.append(new_column) 
                self.df[new_column] = np.nan
                # update new column by diff
                for j in range(differences, len(self.df[new_column])):
                      self.df[new_column][j] = self.df[i][j] - self.df[i][j - differences]
    
              return (self.df[new_column_list])
            else:
              # raise error  for wrong differences value
              raise TypeError("""differences = {0} is invalid. The argument: lag only accepts positive integer. 
                                 \nExample: df.differencing(["var1],differences=2)""".format(differences))

        else: 
          # raise error  for wrong tsColnameList
          raise TypeError("""tsColnameList = {0} is invalid. The argument: tsColnameList only accepts a list of columns name.
                             \nExample:  df.differencing(["var1],differences=2)""".format(tsColnameList))

 
    def movingaverage(self,tsColnameList,window = 3):
        """ 
        movingaverage Method:
        Updating the selected timeseries data by moving average algorithm 
        Add the new data as new columns to orginal dataframe.
  
        Parameters: 
        tsColnameList (list): The list of columns names
        window (int) : window value of moving average algorithm (default = 3)
      
        Returns: 
        dataframe: The dataframe of new columns 
        """
        if isinstance(tsColnameList, list) and (set(tsColnameList).issubset(self.df.columns.to_list())) :
            if isinstance(window, (int)):
              new_column_list = []
              for i in tsColnameList:
                  # setup new cols name
                  new_column = str("moving_"+i)
                  new_column_list.append(new_column) 
                  
                  def moving_average(x, w):
                      # function: compute moving_average
                      return np.convolve(x, np.ones(w), 'valid') / w
                  # format cols and handling edge value 
                  x = moving_average(self.df[i], window)
                  e = len(self.df[i])  - len(x)
                  y = np.zeros(e//2) + np.nan
                  z = np.zeros(e-e//2) + np.nan
                  # create new column
                  self.df[new_column] = np.concatenate((y,x,z), axis=None)
              return (self.df[new_column_list])
            else:
                # raise error  for wrong window value 
              raise TypeError("""window = {0} is invalid. The argument: 
              lag only accepts positive integer. 
                                 \nExample: df.movingaverage(["var1],window = 3)""".format(window))
        else:
            # raise error  for wrong tsColnameList
            raise TypeError("""tsColnameList = {0} is invalid. The argument: tsColnameList only accepts a list of columns name.
                             \nExample: df.movingaverage(["var1],window = 3)""".format(tsColnameList))
  



# Subclass: **TextDataSet**
Public Member Functions
```
def 	__init__ (self, filename)
def 	clean (self, colname, lower=True, remove_diacritics=True, remove_punct=True, remove_stop=True, stem_tokens=False, lemmatize_tokens=False)
def 	token_list (nested_list)
def 	explore (token_list, wordcloud=True, dispersion=True)
def 	wordfrequency (token_list, n, ascending=True)
```

In [None]:
# Subclass: TextDataSet  
class TextDataSet(DataSet):
    
    def __init__(self, filename):
        """Constructor for the TextDataSet subclass.

        Stores the file name through the DataSet constructor and then loads
        the CSV, so ``self.df`` is available right after construction.
        """
        super().__init__(filename)
    
        # populate self.df by reading the CSV with the base-class loader
        DataSet._readFromCSV(self)
  
    class Clean:
        """
        Clean Class: 
        Configurable tokenizer/normalizer used by ``TextDataSet.clean``.
        
        Parameters:
        lower : change text to lower case
        remove_diacritics: remove diacritics
        remove_punct: remove punctuation
        remove_stop: remove stopwords
        stem_tokens: stemming (takes precedence over lemmatization)
        lemmatize_tokens: lemmatization
        
        """

        def __init__(self, lower=True, remove_diacritics=True, remove_punct=True, remove_stop=True, stem_tokens=False, lemmatize_tokens=False):
            """Store the switches and build the NLTK helpers used by tokenize()."""
            self.lower = lower
            self.remove_diacritics = remove_diacritics
            self.stopwords = set(nltk.corpus.stopwords.words('english'))
            self.remove_stop = remove_stop
            self.punct = set(string.punctuation)
            self.remove_punct = remove_punct
            self.stemmer = PorterStemmer()
            self.stem_tokens = stem_tokens
            self.lemmatizer = WordNetLemmatizer()
            self.lemmatize_tokens = lemmatize_tokens

        def normalize_string(self, token):
            """Normalize one token: optional lowercasing, optional diacritic
            removal, and the contraction token "n't" is rewritten to "not"."""
            if self.lower:
                token = token.lower()

            # Decompose (NFD) so accents become separate combining marks, then
            # drop those marks. Raw string: "\p" is not a valid Python string
            # escape and triggers a warning in a normal string literal.
            if self.remove_diacritics:
                token = regex.sub(r"\p{Mn}", '', unicodedata.normalize('NFD', token))

            # NOTE(review): self.stopwords is the stop-word set, so this only
            # adds "the set is non-empty" to the test; `self.remove_stop` may
            # have been intended -- confirm before changing.
            if token == "n't" and self.stopwords:
                token = "not"

            return token

        def is_punct(self, text):
            """Return True when text is found inside string.punctuation or
            begins with a punctuation / non-spacing mark / modifier symbol."""
            if text in string.punctuation:
                return True
            # regex.match anchors at the start of the text only
            if regex.match(r"[\p{P}\p{Mn}\p{Sk}]+", text):
                return True
            return False

        def tokenize(self, text):
            """Split text with NLTK's word_tokenize and return the list of
            normalized tokens, filtered and stemmed/lemmatized as configured."""
            tokens = []
            for token in word_tokenize(text):
                token_text = self.normalize_string(token)
                if self.remove_punct and self.is_punct(token_text):
                    continue
                if self.remove_stop and token_text in self.stopwords:
                    continue

                # Stemming wins when both switches are on; the lemma is only
                # computed when it is actually going to be used.
                if self.stem_tokens:
                    tokens.append(self.stemmer.stem(token_text))
                elif self.lemmatize_tokens:
                    tokens.append(self.lemmatizer.lemmatize(token_text))
                else:
                    tokens.append(token_text)

            return tokens
    
    
    def clean(self, colname, lower=True, remove_diacritics=True, remove_punct=True, remove_stop=True, stem_tokens=False, lemmatize_tokens=False):
        """
        clean Method:
        Tokenize and normalize every entry of one text column.

        Parameters:
        colname: name of the column in self.df to process
        lower : change text to lower case
        remove_diacritics: remove diacritics
        remove_punct: remove punctuation
        remove_stop: remove stopwords
        stem_tokens: stemming
        lemmatize_tokens: lemmatization

        Returns:
        A nested list: one list of tokens per row of the column
        """
        # Gather the switches and hand them to the tokenizer in one go
        settings = dict(
            lower=lower,
            remove_diacritics=remove_diacritics,
            remove_punct=remove_punct,
            remove_stop=remove_stop,
            stem_tokens=stem_tokens,
            lemmatize_tokens=lemmatize_tokens,
        )
        cleaner = TextDataSet.Clean(**settings)

        # One token list per entry, in row order
        return [cleaner.tokenize(entry) for entry in self.df[colname][:]]
    
    def token_list(nested_list):
        
        """The token_list could change the nested list to a flat list"""
        
        # Check whether the input is a nested list
        if any(isinstance(i, list) for i in nested_list) == False:
            raise TypeError("The nested_list is not nested.")
        
        # Make the nested list to flat list
        else:
            flat_list = []
            for sublist in nested_list:
                for item in sublist:
                    flat_list.append(item)
            return flat_list
    

    def explore(token_list, wordcloud=True, dispersion=True):
        
        """T
        explore Method:
        he explore could produce worldcloud and dispersion plot
        
        Parameters:
        token_list: a list of tokens
        wordcloud: produce wordcloud
        dispersion: produce dispersion plot
        
        Returns:
        wordcloud plot and dispersion plot
        
        """
        
        # Check wehter the input is a list
        if isinstance(token_list, list) == False:
            raise ValueError("The format of token_list is incorrect.")
        
        # Raise exception if both arributes are False
        elif wordcloud==False and dispersion==False:
            raise Exception("No plot will be made")
        
        # Make world cloud
        if wordcloud:
            my_string = (" ").join(token_list)
            my_wordcloud= WordCloud().generate(my_string)
    
            # Display the generated image:
            plt.imshow(my_wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
        
        # Make dispersion plot 
        elif dispersion:
            search_words=["like", "love", "adore", "dislike", "hate", "abhor", "detest", "sickening"]
            dispersion_plot(token_list, search_words, ignore_case=True, title="Dispersion of emotional words")
       
        

    def wordfrequency(token_list,n,ascending= True):
        
        """
        wordfrequency Method:
        The wordfrequency could produce the wordfrequency dataframe
        
        Parameters:
        token_list: a list of tokens
        n: number of rows you want to check 
        ascending: the order of the wordfrequency
        
        Return:
        A wordfrequency dataframe 
    
        """
        
        # Check wehter the input is a list
        if isinstance(token_list, list) == False:
            raise ValueError("The format of token_list is incorrect.")
        
        # Check whether n is an integer
        elif isinstance(n, int) == False:
            raise ValueError("n should be an integer.")
    
        else:
            # count the words
            counts = Counter(token_list)
            
            # Make the counts to dataframe
            df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
            df = df.rename(columns={'index':'words', 0:'frequency'})
            if ascending:
                df = df.sort_values('frequency',ascending=True)
            else:
                df = df.sort_values('frequency',ascending=False)
            print(df.head(n))


# Subclass: **QuantDataSet**

Public Member Functions
```
def 	__init__ (self, filename)
def 	clean (self)
def 	explore (self, plot_type, column1, column2=None)
def 	mean (self, column_name)
def 	median (self, column_name_median)
def 	mode (self, column_name_mode)
def 	order (self, column_name_order, decreasing)
```

In [None]:
# subclass QuantDataSet of our super class
class QuantDataSet(DataSet):
    """This is a class used for cleaning, plotting and doing some calculating work for a quantitative dataset

    Every method works on ``self.df``. That attribute is not created in this
    class; presumably DataSet._readFromCSV sets it to a pandas DataFrame
    (confirm against the superclass).
    """

    def __init__(self, filename):
        """constructor for subclass, inherited from Dataset"""
        super().__init__(filename)
        DataSet._readFromCSV(self)

    def _require_column(self, column_name, error_type):
        """Raise error_type if column_name is not one of the columns of self.df"""
        if column_name not in self.df.columns.values:
            raise error_type('The column name is incorrect!')

    def clean(self):
        """
        clean Method:
        Fill in missing values with the mean of its column

        Parameters:
        inherit the variable from the above

        Returns:
        None; self.df is modified in place. Columns without a numeric mean
        are left untouched.
        """
        Quant_dataset = self.df

        # One mean per numeric column, looked up by column label. Missing
        # values are ignored when the mean is computed.
        column_means = Quant_dataset.mean(numeric_only=True)

        # fill in missing values with the mean of its column
        Quant_dataset.fillna(value=column_means, inplace=True)
        print("Cleaning finished, we have filled all the missing values with the mean of its column")

    # the explore method
    def explore(self, plot_type, column1, column2=None):
        """
        explore Method:
        Give a histogram of a certain column or a scatter plot of 2 columns

        Parameters:
        plot_type (string): the type of the plot -- "hist" or "scatter"
        column1 (string): the column will be used in the histogram or the scatter plot
        column2=None (string): the another column will be used in the scatter plot

        Returns:
        None; the histogram or the scatter plot is shown with plt.show()

        Raises:
        KeyError: if a required column name is not in the dataset
        ValueError: if plot_type is neither "hist" nor "scatter"
        """
        self.column1 = column1
        self.column2 = column2
        self.plot_type = plot_type
        Quant_dataset = self.df

        if plot_type == "hist":
            # Plot the hist of a certain column
            self._require_column(column1, KeyError)
            plt.figure(figsize=(8, 6), dpi=80)
            plt.hist(Quant_dataset[column1])
            plt.title(column1)
            plt.xlabel(column1)
            plt.ylabel('Count')
            plt.show()
        elif plot_type == "scatter":
            # Plot the scatter plot to show the relationship between 2 certain columns.
            # Each name is checked on its own: `a and b in columns` would
            # only look up b.
            self._require_column(column1, KeyError)
            self._require_column(column2, KeyError)
            plt.figure(figsize=(8, 6), dpi=80)
            plt.scatter(Quant_dataset[column1], Quant_dataset[column2])
            plt.title("The scatter plot")
            plt.xlabel(column1)
            plt.ylabel(column2)
            plt.show()
        else:
            # raise error when input wrong plot types
            raise ValueError('The plot type is incorrect! Please type in scatter or hist!')

    def mean(self, column_name):
        """
        mean Method:
        Print and return the mean of a certain column

        Parameter:
        column_name (string): the column to calculate the mean

        Returns:
        float: the mean of the certain column, or None when the mean cannot
               be computed for that column

        Raises:
        ValueError: if column_name is not in the dataset
        """
        Quant_dataset = self.df
        self.column_name = column_name
        # raise error when input wrong column names
        self._require_column(column_name, ValueError)

        try:
            result = Quant_dataset[column_name].mean()
        except (ValueError, TypeError):
            print("The column ", column_name, " is not numeric")
            return None
        print("the mean of the column " + str(column_name) + " is", result)
        return result

    def median(self, column_name_median):
        """
        median Method:
        Print and return the median of a certain column

        Parameter:
        column_name_median (string): the column to calculate the median

        Returns:
        float: the median of the certain column, or None when the median
               cannot be computed for that column

        Raises:
        ValueError: if column_name_median is not in the dataset
        """
        Quant_dataset = self.df
        self.column_name_median = column_name_median
        # raise error when input wrong column names
        self._require_column(column_name_median, ValueError)

        try:
            result = Quant_dataset[column_name_median].median()
        except (ValueError, TypeError):
            print("The column ", column_name_median, " is not numeric")
            return None
        print("the median of the column " + str(column_name_median) + " is", result)
        return result

    def mode(self, column_name_mode):
        """
        mode Method:
        Print and return the mode of a certain column. Missing values are
        ignored; when several values are equally frequent, the one that
        appears first in the column is reported.

        Parameter:
        column_name_mode (string): the column to calculate the mode

        Returns:
        the mode of the certain column, or None when the column has no
        non-missing values or its values cannot be counted

        Raises:
        ValueError: if column_name_mode is not in the dataset
        """
        Quant_dataset = self.df
        self.column_name_mode = column_name_mode
        # raise error when input wrong column names
        self._require_column(column_name_mode, ValueError)

        # Drop missing values first so that NaN is never reported as the mode
        observed = Quant_dataset[column_name_mode].dropna().values.tolist()
        if not observed:
            print("The column ", column_name_mode, " has no values")
            return None

        try:
            # most_common keeps first-seen order among equally frequent values
            most_frequent = Counter(observed).most_common(1)[0][0]
        except (ValueError, TypeError):
            print("The column ", column_name_mode, " is not numeric")
            return None
        print("the mode of the column " + str(column_name_mode) + " is", most_frequent)
        return most_frequent

    def order(self, column_name_order, decreasing):
        """
        order Method:
        Print and return the dataset sorted by a certain column. self.df
        itself is not reordered.

        Parameter:
        column_name_order (string): the column used to sort the dataset
        decreasing=False/True :the order type

        Returns:
        dataframe: the sorted dataset

        Raises:
        ValueError: if column_name_order is not in the dataset or decreasing
                    is neither True nor False
        """
        self.column_name_order = column_name_order
        self.decreasing = decreasing
        Quant_dataset = self.df

        # raise error when input wrong column names
        self._require_column(column_name_order, ValueError)
        if decreasing not in (True, False):
            raise ValueError('The parameter decreasing is incorrect! Please input True or False')

        sorted_df = Quant_dataset.sort_values(by=column_name_order, ascending=not decreasing)
        print(sorted_df)
        return sorted_df




# Subclass: **QualDataSet**

Public Member Functions
```
def 	__init__ (self, filename)
def 	clean (self)
def 	explore (self, plot_type_qual, column_qual)
def 	table (self, column_table)
```

In [None]:
# subclass QualDataSet of our super class
class QualDataSet(DataSet):
    """This is a class used for clean, plot and see the category frequency of a qualitative dataset

    Every method works on ``self.df``. That attribute is not created in this
    class; presumably DataSet._readFromCSV sets it to a pandas DataFrame
    (confirm against the superclass).
    """

    def __init__(self, filename):
        """constructor for subclass, inherited from Dataset"""
        super().__init__(filename)
        DataSet._readFromCSV(self)

    def _require_column(self, column_name):
        """Raise KeyError if column_name is not one of the columns of self.df"""
        if column_name not in self.df.columns.values:
            # raise error when input a wrong column name
            raise KeyError('The column name is incorrect!')

    def clean(self):
        """
        clean Method:
        Fill in missing values with the mode of its column. The mode is
        taken over the non-missing values only; when several values are
        equally frequent, the one that appears first in the column is used.

        Parameter:
        inherited from the Dataset class

        Return:
        None; self.df is modified in place. A column without any
        non-missing value is left untouched.
        """
        Qual_dataset = self.df

        for column in Qual_dataset.columns:
            series = Qual_dataset[column]
            if not series.isna().any():
                continue

            # Count only the observed values so that NaN can never be
            # chosen as the replacement.
            observed = series.dropna().values.tolist()
            if not observed:
                continue

            # fill in missing values with the mode of its column
            fill_value = Counter(observed).most_common(1)[0][0]
            Qual_dataset[column] = series.fillna(fill_value)
        print("Cleaning finished, we have filled all the missing values with the mode of its column")

    def explore(self, plot_type_qual, column_qual):
        """
        explore Method:
        Give a histogram or a pie chart of a certain column

        Parameters:
        plot_type_qual (string): the plot type --pie or hist
        column_qual (string): the column used for create the plot

        Return:
        None; the pie chart or the histogram is shown with plt.show()

        Raises:
        ValueError: if plot_type_qual is neither "pie" nor "hist"
        KeyError: if column_qual is not in the dataset
        """
        Qual_dataset = self.df
        self.column_qual = column_qual
        self.plot_type_qual = plot_type_qual

        if plot_type_qual == "hist":
            # give a histogram of a certain column to show the frequency of each category
            self._require_column(column_qual)
            try:
                plt.figure(figsize=(8, 6), dpi=80)
                plt.hist(Qual_dataset[column_qual])
                plt.title(column_qual)
                plt.xlabel(column_qual)
                plt.ylabel('Frequency')
                plt.show()
            except (ValueError, TypeError):
                # Report the column that failed, not the plot type
                print("The column ", column_qual, "has NA values, please clean it")

        elif plot_type_qual == "pie":
            # give a pie chart of a certain column to show the proportion of each category
            self._require_column(column_qual)
            category_counts = Counter(Qual_dataset[column_qual].values.tolist())
            plt.figure()
            plt.pie(list(category_counts.values()),
                    labels=list(category_counts.keys()),
                    autopct='%1.2f%%')
            plt.title("Pie Chart")
            plt.show()
        else:
            # raise error when input a wrong plot type
            raise ValueError('The plot type is incorrect! Please type in pie or hist')

    def table(self, column_table):
        """
        table Method:

        Show the frequency of each category for a certain column

        Parameter:
        column_table(string): the column to be tabled

        Return:
        dictionary: the frequency of each category in the certain column,
                    keyed by category in order of first appearance

        Raises:
        KeyError: if column_table is not in the dataset
        """
        Qual_dataset = self.df
        self.column_table = column_table

        self._require_column(column_table)
        # Convert back to a plain dict so callers keep receiving the same type
        return dict(Counter(Qual_dataset[column_table].values.tolist()))




